Exporting Summary Data to CSV in R - r

Hello everyone I am working on a script that I would like to export to a CSV file.
Everything is working well with the exception that I would like to add column names and headers for the below data.
For instance variable A is the summary data of fixed income trades in 2017. I would like Row 1 in the output file to read as such.
Any help would be greatly appreciated. My code is written below. Thanks in advance!!
# Writes a labelled summary of pre-refunded trade yields to outfile.csv, which
# is overwritten each time the script is run. Each row is one trade subset and
# each column one summary statistic, so the file is a real CSV with a header.

# SHORT-TERM PRE-REFUNDED TRADE DATA
# Rows of MSRB with Coupon.Rate >= 2 and Class "PRE-REFUNDED" for the given
# Year and minimum Par.Traded. which() drops rows where the condition is NA.
pre_refunded_trades <- function(year, min_par) {
  keep <- MSRB$Coupon.Rate >= 2 & MSRB$Year == year &
    MSRB$Par.Traded >= min_par & MSRB$Class == "PRE-REFUNDED"
  MSRB[which(keep), ]
}

A <- pre_refunded_trades(2017, 500)
B <- pre_refunded_trades(2017, 1000)
C <- pre_refunded_trades(2018, 500)
D <- pre_refunded_trades(2018, 1000)
E <- pre_refunded_trades(2019, 500)
# NOTE(review): `F` masks the built-in shorthand for FALSE; the name is kept
# so that later code referring to F keeps working.
F <- pre_refunded_trades(2019, 1000)

# SUMMARY OF PRE-REFUNDED DATA
# The same six statistics summary() reports for a numeric vector, plus the
# number of missing yields, returned as a fixed-length named vector so that
# every subset yields a row of the same shape.
yield_stats <- function(trades) {
  yield <- trades$Yield
  quartiles <- stats::quantile(yield, na.rm = TRUE, names = FALSE)
  c(
    "Min." = quartiles[1],
    "1st Qu." = quartiles[2],
    "Median" = quartiles[3],
    "Mean" = mean(yield, na.rm = TRUE),
    "3rd Qu." = quartiles[4],
    "Max." = quartiles[5],
    "NA's" = sum(is.na(yield))
  )
}

# The list names become the row labels in the output file.
trade_sets <- list(
  "Pre-refunded trades 2017, Par.Traded >= 500" = A,
  "Pre-refunded trades 2017, Par.Traded >= 1000" = B,
  "Pre-refunded trades 2018, Par.Traded >= 500" = C,
  "Pre-refunded trades 2018, Par.Traded >= 1000" = D,
  "Pre-refunded trades 2019, Par.Traded >= 500" = E,
  "Pre-refunded trades 2019, Par.Traded >= 1000" = F
)
stats_matrix <- t(vapply(trade_sets, yield_stats, numeric(7)))
yield_summary <- data.frame(
  Trades = rownames(stats_matrix),
  stats_matrix,
  check.names = FALSE,
  row.names = NULL
)

# END OF OUTPUT FILE
write.csv(yield_summary, "outfile.csv", row.names = FALSE)

Related

How to implement a for loop when multiple dataframes are involved?

I am currently working in R and this is a sample of a task I was assigned:
# Number of weekly poll data frames (datos1, datos2, ...) to process. Raise
# this to cover more weeks; every datos<i> up to n_weeks must exist.
n_weeks <- 10

# Counts the rows of one poll data frame that meet all criteria: c_res is 1
# or 3, r_def is 0, eda is between 15 and 98, emp_ppal is 1 and ambito1 is
# not 1. The result is NA if any row's condition evaluates to NA, exactly as
# in the original ifelse()-based sum.
count_poina <- function(datos) {
  meets_criteria <- (datos$c_res == 1 | datos$c_res == 3) &
    (datos$r_def == 0) &
    (datos$eda >= 15 & datos$eda <= 98) &
    datos$emp_ppal == 1 &
    datos$ambito1 != 1
  sum(as.numeric(meets_criteria))
}

# Look the data frames up by name instead of writing one statement per week.
weekly_polls <- mget(
  paste0("datos", seq_len(n_weeks)),
  envir = environment(),
  inherits = TRUE
)
POINA[seq_len(n_weeks)] <- vapply(
  weekly_polls, count_poina, numeric(1),
  USE.NAMES = FALSE
)
I have several data frames that, for the sake of simplicity, are named "datos1" ... "datos120". These data frames are the results of telephone polls. Each data frame contains different individuals, and each poll corresponds to a specific week in the year.
POINA[i] is a numeric vector where each entry is the total sum of surveyed people who fit the criterion specified above.
As can be seen, the criteria remain the same every week but, since each week is a different data frame, datos[i] changes for every POINA[i].
Is there a way to avoid writing out the 120 weeks one by one?
I have tried doing it manually, but there are just too many cases, so any help in making this more efficient would be deeply appreciated.

Using data.table in r to eliminate inner for loop

I have an inner for-loop in R which I have identified as significant bottleneck in my code. The script simulates the effect of a time-varying policy on individuals prior to adulthood. The outer loop runs over a list of cohorts (yob = 1910,...,1930 etc.) that I would like to study. The inner loop counts from ages from a = 5 to a = 17. CSL.details is a data.table that contains the details of each law that I am studying in form of the variables I grab, which vary by year = birthyear + a. To understand the overall effects of the policy by birth cohort, I need to track ca_years1, ca_years2, ca_years3, and ca_years4 for each a.
# For one state, walk every birth cohort through ages 5 to 17, accumulate the
# four ca_years counters from the law in force at each age, and store the
# cohort totals in CSL.exposures.
ages = seq.int(5,17)
state = "Massachusetts"
yob = seq.int(1910, 1930)
for (birthyear in yob){
# Running counters for this cohort; reset at the start of every birth year.
ca_years1 = 0; ca_years2 = 0; ca_years3 = 0; ca_years4 = 0;
for (a in ages){
thisyear = birthyear + a
# Grab each law for given state and year and implement exemption permit
# NOTE(review): inside the data.table subset, `yob` presumably resolves to the
# CSL.details column rather than the cohort vector defined above -- confirm.
thislaw <- CSL.details[statename == state & yob == birthyear & thisyear == year]
# No law row for this state/cohort/year: leave the counters unchanged.
if (nrow(thislaw) == 0) next
# The exemption flags use the counters as they stand before this age's
# increments.
# NOTE(review): the if() conditions below need length-one values, so this
# assumes thislaw has exactly one row -- confirm.
exempt_workpermit = (ca_years2 >= thislaw$workyrs & a >= thislaw$workage & thislaw$workage > 0)
exempt_yearstodropout = (ca_years3 >= thislaw$earlyyrs & a >= thislaw$earlyyrs_condition & thislaw$earlyyrs > 0)
exempt_cont = ((ca_years2 + ca_years4) >= thislaw$contyrs & thislaw$contyrs > 0)
# Increment each law when school is required
# ca_years1 counts every age in [entryage, exitage); ca_years2 and ca_years3
# count the same ages unless the respective exemption applies.
if(thislaw$entryage <= a & a < thislaw$exitage){
ca_years1 = ca_years1 + 1
if(!exempt_workpermit){ca_years2 = ca_years2 + 1}
if(!exempt_yearstodropout){ca_years3 = ca_years3 + 1}
}
# ca_years4 counts ages from workage up to (but excluding) contage, when
# workage is set and no continuation exemption applies, skipping ages that
# were already counted in ca_years2 by the block above.
if(thislaw$contage > a &
a >= thislaw$workage &
!exempt_cont &
thislaw$workage > 0 &
!(thislaw$entryage <= a & a < thislaw$exitage & !exempt_workpermit)
){ca_years4 = ca_years4 + 1}
}
# Store this cohort's final counters in the matching CSL.exposures rows.
CSL.exposures[statename == state & yob == birthyear]$ca_years1 = ca_years1
CSL.exposures[statename == state & yob == birthyear]$ca_years2 = ca_years2
CSL.exposures[statename == state & yob == birthyear]$ca_years3 = ca_years3
CSL.exposures[statename == state & yob == birthyear]$ca_years4 = ca_years4
}
Is there a data.table solution for replacing the inner-loop? I am an intermediate R coder and it is a bit difficult to think of how to get started. Although I would prefer data.table exclusively, I am open to dplyr-type solutions if they significantly speed up the code.
Edit: here is an example of what CSL.detail looks like, as a copy-pasted data.table.
statename year yob statefip entryage exitage earlyyrs earlyyrs_condition workage workyrs contage contyrs statecompschoolyr
1: Massachusetts 1913 1800 25 7 16 4 14 14 4 16 0 1852
2: Massachusetts 1913 1801 25 7 16 4 14 14 4 16 0 1852
3: Massachusetts 1913 1802 25 7 16 4 14 14 4 16 0 1852
4: Massachusetts 1913 1803 25 7 16 4 14 14 4 16 0 1852
5: Massachusetts 1913 1804 25 7 16 4 14 14 4 16 0 1852
I managed to refactor the code to solve the problem. The key idea is to exploit state and yob as grouping variables (since all calculations happen within a state and yob pair). This completely eliminates the outer loops and requires only a single loop, iterating by age. I am just saving this answer here for reference, but I am not sure that there is a broader lesson for the stackoverflow.com community so feel free to delete. The time savings are on the order of 95%, primarily because it reduces the overhead time to call data.table.
# One pass per age: update the ca_years counters of every (statename, yob)
# pair at once instead of looping over cohorts.
# NOTE(review): `%>%` needs magrittr (or a package that re-exports it) to be
# attached, and the ca_years columns must already exist in CSL.exposures;
# neither is shown here -- confirm.
for(a in ages){
# Copy the running ca_years totals from CSL.exposures into CSL.details,
# matching rows on statename and yob (the i. prefix refers to CSL.exposures).
CSL.details[CSL.exposures, on = .(statename, yob),
`:=` (ca_years1 = i.ca_years1,
ca_years2 = i.ca_years2,
ca_years3 = i.ca_years3,
ca_years4 = i.ca_years4)] %>%
.[year == a + yob,
`:=`(
# For the rows whose year equals a + yob, compute the exemption flags from
# the counters accumulated so far.
exempt_workpermit = (ca_years2 >= workyrs & a >= workage & workage > 0),
exempt_yearstodropout = (ca_years3 >= earlyyrs & a >= earlyyrs_condition & earlyyrs > 0),
exempt_cont = ((ca_years2 + ca_years4) >= contyrs & contyrs > 0)
), by = .(statename, yob)]
# Copy this age's exemption flags and law parameters from the matching
# CSL.details rows back into CSL.exposures.
CSL.exposures[
CSL.details[year == a + yob], on = .(yob, statename),
`:=` (exempt_workpermit = i.exempt_workpermit, exempt_yearstodropout = i.exempt_yearstodropout,
exempt_cont = i.exempt_cont, entryage = i.entryage,
exitage = i.exitage, contage = i.contage, workage = i.workage) ] %>%
.[ ,
`:=` (
# Add one to each counter where its condition holds; the `na` argument keeps
# the current counter value when the condition evaluates to NA.
ca_years1 =
fifelse(entryage <= a & a < exitage,
ca_years1 + 1, ca_years1, na = as.numeric(ca_years1)),
ca_years2 =
fifelse(entryage <= a & a < exitage & !exempt_workpermit,
ca_years2 + 1, ca_years2, na = as.numeric(ca_years2)),
ca_years3 =
fifelse(entryage <= a & a < exitage & !exempt_yearstodropout,
ca_years3 + 1, ca_years3, na = as.numeric(ca_years3)),
ca_years4 =
fifelse(contage > a & a >= workage & !exempt_cont &
workage > 0 &
!(entryage <= a & a < exitage & !exempt_workpermit),
ca_years4 + 1, ca_years4, na = as.numeric(ca_years4))),
by = .(statename, yob)
]
}

Create new variable in R with assumptions from SPSS file

I've read in my SPSS file in R and want to recode a new variable if such and such assumptions are made. To be specific:
I want to turn my spssdata_sub$gest variable into a new variable if the following the conditions are met:
spssdata_sub$indusert != 2 & spssdata_sub$ivf != 1 & spssdata_sub$leie != 3 & spssdata_sub$svkompl_II != 7 & spssdata_sub$svkompl_II != 2 & spssdata_sub$svkompl_II != 1
Anyone here who can help me with a code?
Does one of the following codes work for you?
Either this adapted version of Renu's solution
# Keep gest only for observations that meet every criterion; all other
# observations (including those where the test is NA) are set to NA.
spssdata_sub$gest <- with(
  spssdata_sub,
  ifelse(
    indusert != 2 & ivf != 1 & leie != 3 &
      svkompl_II != 7 & svkompl_II != 2 & svkompl_II != 1,
    gest,
    NA
  )
)
or this code for filtering observations:
library(dplyr)
# Keep only the observations that meet every criterion. The last condition
# must refer to svkompl_II; the misspelt `ssvkompl_II` names a column that
# does not exist and makes filter() fail.
spssdata_sub_new <- spssdata_sub %>%
  filter(
    indusert != 2 & ivf != 1 & leie != 3 &
      svkompl_II != 7 & svkompl_II != 2 & svkompl_II != 1
  )
One way is the following, if you really mean either one of the conditions
# Keep the rows of spssdata for which every condition is TRUE; rows where any
# condition is FALSE or NA are dropped.
Mynewdata <- dplyr::filter(
  spssdata,
  indusert != 2 & ivf != 1 & leie != 3 &
    svkompl_II != 7 & svkompl_II != 2 & svkompl_II != 1
)
This only keeps entries that match none of the listed values; put the other way, it excludes entries that have either indusert = 2 or ivf = 1, etc. Meeting a single one of these conditions is enough to exclude an entry.
add-on: or something also like that:
# Same filter written with %in% for the svkompl_II values. The call is now
# closed properly (the original was missing its final parenthesis).
# Note: %in% never returns NA, so rows with a missing svkompl_II pass this
# particular condition, whereas the != comparisons would drop them.
Mynewdata <- dplyr::filter(
  spssdata, indusert != 2, ivf != 1, leie != 3,
  !(svkompl_II %in% c(7, 2, 1))
)

How to write this ifelse statement in R correctly

I want to return a value in a column, or NA, contingent on values in other columns.
I basically want to see if the value in the column meets the first test criteria:
df$v2.1 >= df$varx & df$v3.1 <6
if not does it meet the second:
df$v4.1 >= df$vary & df$v5.1 >5
and then if neither return NA
The code I have tried is below.
# NOTE: `||` is not vectorised, so the test below is not evaluated row by row;
# use `|` for an element-wise test. R >= 4.3 raises an error when `||` is given
# operands longer than one.
df$v1.1 = ifelse(df$v2.1 >= df$varx & df$v3.1 <6 || df$v4.1 >= df$vary & df$v5.1 >5 ,df$v1.1, NA)
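Your only mistake is using || rather than |. || is not vectorised: in older versions of R it only considers the first element of each operand, and since R 4.3.0 it raises an error when an operand is longer than one. All your other operators (and ifelse()) are vectorised, so the following should work as expected: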
# Keep v1.1 where either pair of criteria holds, element by element; set it
# to NA everywhere else.
df$v1.1 <- with(
  df,
  ifelse(
    (v2.1 >= varx & v3.1 < 6) | (v4.1 >= vary & v5.1 > 5),
    v1.1,
    NA
  )
)
A good way to check when you're doing reasonably complex or multiple logical operations is to run each one of them and see if you're getting the expected output. If you run:
df$v2.1 >= df$varx & df$v3.1 <6
or
df$v4.1 >= df$vary & df$v5.1 > 5
you should get a vector of logical values. If you run:
df$v2.1 >= df$varx & df$v3.1 <6 || df$v4.1 >= df$vary & df$v5.1 > 5
you should get a single logical value. In your case, that will give a single result from the ifelse(), which then gets recycled to fill df$v1.1.
From what I can tell df$v1.1 is already defined, so you only need to modify those rows that fail the test in your ifelse. The following might be easier:
# Blank out v1.1 only in the rows where neither pair of criteria holds. Rows
# whose test is NA are skipped by which() and keep their current value.
df$v1.1[
  which(
    !((df$v2.1 >= df$varx & df$v3.1 < 6) | (df$v4.1 >= df$vary & df$v5.1 > 5))
  )
] <- NA

R: recommendation on how to compute new columns on multiple condition of others for every row in data.frame

For every row I need to compute two variables as new columns in a data.frame, conditional on more than 60 other columns. I would like your recommendation on how to do that elegantly (while and for, with, ifelse, foreach, by or ddply?). I don't want to do it manually as I did for the first cases in the example code, and I don't care about performance.
Further: I would probably not need to ask if I had understood how to use functions like transform (with ddply or by) and what they do. So I hope you can recommend good tutorials on that, ideally relating to my case. I found a lot, but in different contexts, and was not able to comprehend them entirely or adapt them to my case.
My case: I have three columns for each of 20 events representing the kind and date of that event. For each row I need to compute (and save to that data.frame) the difference in time between one special event (depending on whether a special kind happened before or after another) and a date fixed for every entry in rows. Furthermore i need to save the date of that event.
This is how i did (it works, but it is running only through the first cases):
# event.2 (1. event month), event.3 (1. event year), event.4 (1. event kind),
# event.5 (2. event month), event.6 (2. event year), ...
# (event.7 is presumably the kind of the 2. event -- confirm.)
#
# Fills df$dit (time between the relevant event and fixdate) and df$date (the
# time of that event) for the first two events. Every right-hand side is
# subset with the same row indices as the left-hand side, so each row receives
# its own value; assigning a full-length vector into a subset would misalign
# the values and raise a length warning.

# Event times as decimal years.
first_time <- (1 / 12 * df$event.2) + df$event.3
second_time <- (1 / 12 * df$event.5) + df$event.6

# Rows in which all six fields of the first two events are present.
complete <- !is.na(df$event.2) & !is.na(df$event.3) & !is.na(df$event.4) &
  !is.na(df$event.5) & !is.na(df$event.6) & !is.na(df$event.7)

first_kind_1_or_2 <- df$event.4 %in% c(1, 2)
second_kind_1_or_2 <- df$event.7 %in% c(1, 2)

# which() turns each mask into row numbers and drops NA (for example when
# fixdate is missing), which would otherwise be an error in the assignment.
# Case 1: first event of kind 3 after fixdate, second event of kind 1 or 2.
kind3_after <- which(
  complete & df$event.4 == 3 & first_time > df$fixdate & second_kind_1_or_2
)
# Case 2: first event of kind 1 or 2 after fixdate.
first_after <- which(complete & first_kind_1_or_2 & first_time > df$fixdate)
# Case 3: first event of kind 1 or 2 before fixdate and second event of kind
# 1 or 2 after fixdate.
second_after <- which(
  complete & first_kind_1_or_2 & first_time < df$fixdate &
    second_kind_1_or_2 & second_time > df$fixdate
)

# Create the result columns if they do not exist yet, so that the subset
# assignments below always operate on full-length columns.
if (!"dit" %in% names(df)) {
  df$dit <- NA_real_
}
if (!"date" %in% names(df)) {
  df$date <- NA_real_
}

df$dit[kind3_after] <- first_time[kind3_after] - df$fixdate[kind3_after]
df$date[kind3_after] <- first_time[kind3_after]

df$dit[first_after] <- 0
df$date[first_after] <- df$fixdate[first_after]

df$dit[second_after] <- second_time[second_after] - df$fixdate[second_after]
df$date[second_after] <- second_time[second_after]
You can define your conditions as expressions and use them within transform. The idea is to factor out your conditions as much as possible.
# Reusable conditions, stored unevaluated so that they can later be evaluated
# against the columns of a data frame with eval().

# All six fields of the first two events are present.
COND1 <- expression(
  !is.na(event.2) & !is.na(event.3) &
    !is.na(event.4) & !is.na(event.5) &
    !is.na(event.6) & !is.na(event.7)
)
# First event is of kind 3 and after fixdate, second event is of kind 1 or 2.
# (The parentheses are balanced here; the original definition did not parse.)
COND2 <- expression(
  (event.4 == 3 & ((1 / 12 * event.2) + event.3) > fixdate) &
    (event.7 == 1 | event.7 == 2)
)
# First event is of kind 1 (COND3) or kind 2 (COND4) and after fixdate.
COND3 <- expression(event.4 == 1 & ((1 / 12 * event.2) + event.3) > fixdate)
COND4 <- expression(event.4 == 2 & ((1 / 12 * event.2) + event.3) > fixdate)
### you continue here with the rest of conditions....
Then using them within transform you can do something like:
# Compute date in a single transform() so that the second case does not
# overwrite the first: rows meeting COND1 & COND2 get the time of the first
# event, rows meeting COND1 & (COND3 | COND4) get fixdate, all others NA.
# transform() returns a modified copy, so the result is assigned back to df.
df <- transform(
  df,
  date = ifelse(
    eval(COND1) & eval(COND2),
    (1 / 12 * event.2) + event.3,
    ifelse(eval(COND1) & (eval(COND3) | eval(COND4)), fixdate, NA)
  )
)
## Note also that the second "dit" variable is deduced from "date"
df <- transform(df, dit = date - fixdate)

Resources