Unable to generate matrix in desired format - R
I need to write a function that creates a matrix across two groups and also provides totals of the groups at the row and column level, along with the matrix results.
Inputs to the function:
df
credit breaks
Rate_Cutpoints 1 to 6 (one vector of rate breaks per credit band)
The output needs to be the final table shown below.
My data frame:
credit <- c(10,20,30,40,10,30,50,70,90,100,25,45,67,87,98,54,34,56,78,23,45,56,12)
rate <- c(1,2,3,4,1,3,5,7,9,10,2,4,6,8,9,5,3,5,7,2,4,5,1)
Marks <- c(9,3,5,6,7,8,9,1,3,10,4,5,6,7,5,4,8,3,5,6,7,8,9)
Points <- c(1,2,3,4,5,6,7,8,9,10,2,3,4,4,5,7,8,3,4,5,6,7,8)
Scale <- c(1,2,3,4,5,6,7,8,9,10,2,3,4,4,5,7,8,3,4,5,6,7,8)
Category <- c('book', 'pen', 'textbook', 'pencil_case','book', 'pen', 'textbook', 'pencil_case','book', 'pen', 'textbook', 'pencil_case','book','pen' ,'pen', 'textbook', 'pencil_case','book', 'pen', 'textbook', 'pencil_case','book', 'pencil_case')
# Join the variables to create a data frame
df <- data.frame(credit,rate,Marks,Points,Scale,Category)
My inputs:
credit_breaks<-c(0,15,30,45,65,75,1000)
Rate_Cutpoints1<-c(0,1,2,5,7,9,10)
Rate_Cutpoints2<-c(0,3,4,7,8,9,10)
Rate_Cutpoints3<-c(0,1,5,6,8,9,10)
Rate_Cutpoints4<-c(0,1,3,6,7,9,10)
Rate_Cutpoints5<-c(0,2,3,4,8,9,10)
Rate_Cutpoints6<-c(0,3,4,5,7,9,10)
My code: it first builds a Credit_Band column from the credit breaks provided as input, then uses that band to build a New_Band column from the matching rate cutpoints, calculates a few metrics, and reports them.
library(dplyr)

# assign each row to a credit band based on the supplied credit breaks
df1 <- df %>%
  mutate(Credit_Band = cut(credit, breaks = credit_breaks,
                           include.lowest = TRUE, right = TRUE, labels = FALSE))

# assign a rate band using the cutpoint vector that matches the row's credit band
df2 <- df1 %>%
  group_by(credit) %>%
  mutate(New_Band =
           ifelse(Credit_Band == 1, cut(rate, Rate_Cutpoints1, labels = FALSE),
           ifelse(Credit_Band == 2, cut(rate, Rate_Cutpoints2, labels = FALSE),
           ifelse(Credit_Band == 3, cut(rate, Rate_Cutpoints3, labels = FALSE),
           ifelse(Credit_Band == 4, cut(rate, Rate_Cutpoints4, labels = FALSE),
           ifelse(Credit_Band == 5, cut(rate, Rate_Cutpoints5, labels = FALSE),
           ifelse(Credit_Band == 6, cut(rate, Rate_Cutpoints6, labels = FALSE),
                  NA)))))))
df2 <- as.data.frame(df2)
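As an aside, the six nested ifelse() calls could be replaced by keeping the cutpoint vectors in a list and picking the right one per row. This is only a sketch of an alternative, not part of the original code; the names rate_cutpoints and df2_alt are introduced here for illustration.

rate_cutpoints <- list(Rate_Cutpoints1, Rate_Cutpoints2, Rate_Cutpoints3,
                       Rate_Cutpoints4, Rate_Cutpoints5, Rate_Cutpoints6)
df2_alt <- df1
# pick the cutpoint vector that matches each row's credit band
df2_alt$New_Band <- mapply(function(r, band) cut(r, rate_cutpoints[[band]], labels = FALSE),
                           df1$rate, df1$Credit_Band)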
summary_results <- df2 %>%
  group_by(Credit_Band, New_Band) %>%
  dplyr::summarize(dist = n() / nrow(df2),
                   count = n(),
                   avg_marks = sum(Marks, na.rm = TRUE),   # note: a sum of Marks, despite the avg_ name
                   sum_points = sum(Points, na.rm = TRUE),
                   sum_scale = sum(Scale, na.rm = TRUE))
summary_results$final<-summary_results$avg_marks/summary_results$sum_points
results<-reshape2::dcast(data = summary_results,formula = Credit_Band~New_Band,
value.var = "final")
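For reference, the same reshape can also be written with tidyr (assuming tidyr >= 1.0 is available); this is an equivalent sketch of the dcast call above, not code from the original post:

results <- tidyr::pivot_wider(summary_results, id_cols = Credit_Band,
                              names_from = New_Band, values_from = final)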
My resulting output is a cross tab across credit and rate bands.
The code below then calculates totals across the credit and rate bands:
total_rows_value <- df2 %>%
  group_by(New_Band) %>%
  dplyr::summarize(sum_points = sum(Points, na.rm = TRUE),
                   avg_marks = sum(Marks, na.rm = TRUE))
total_rows_value$final <- total_rows_value$avg_marks / total_rows_value$sum_points

total_cols_vals <- df2 %>%
  group_by(Credit_Band) %>%
  dplyr::summarize(sum_points = sum(Points, na.rm = TRUE),
                   avg_marks = sum(Marks, na.rm = TRUE))
total_cols_vals$final <- total_cols_vals$avg_marks / total_cols_vals$sum_points
Now the outputs above need to be combined to produce the final matrix below, which is the desired output:
Credit_Band     1    2    3    4    5    6  TotalCols
1            1.78   NA   NA   NA   NA   NA       1.79
2            1.44   NA   NA   NA   NA   NA       1.44
3              NA 1.23   NA   NA   NA   NA       1.24
4              NA   NA    1   NA   NA   NA          1
5              NA   NA   NA 0.58   NA   NA       0.58
6              NA   NA   NA 1.25 0.83    1       0.93
Total_R      1.59 1.24    1 0.75 0.83    1
(results_body <- results[, -1])
(results_rownames <- results[, 1])

# append the per-New_Band totals as an extra row and the per-Credit_Band totals as an extra column
(fin <- cbind(
  rbind(results_body, total_rows_value$final),
  totcol = c(total_cols_vals$final, NA)))
rownames(fin) <- c(results_rownames, "Total_R")
> round(fin,2)
1 2 3 4 5 6 totcol
1 1.79 NA NA NA NA NA 1.79
2 1.44 NA NA NA NA NA 1.44
3 NA 1.24 NA NA NA NA 1.24
4 NA NA 1 NA NA NA 1.00
5 NA NA NA 0.58 NA NA 0.58
6 NA NA NA 1.25 0.83 1 0.94
Total_R 1.59 1.24 1 0.75 0.83 1 NA
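Putting the pieces together, here is one way the whole pipeline could be wrapped into the function the question asks for. This is a sketch only: the function name build_matrix() and the rate_cutpoints argument (a list holding the six cutpoint vectors) are choices made here, not from the original post, and the .groups argument assumes dplyr >= 1.0.

library(dplyr)

build_matrix <- function(df, credit_breaks, rate_cutpoints) {
  # band credit, then band rate using the cutpoint vector for that credit band
  df1 <- df %>%
    mutate(Credit_Band = cut(credit, breaks = credit_breaks,
                             include.lowest = TRUE, right = TRUE, labels = FALSE))
  df1$New_Band <- mapply(function(r, band) cut(r, rate_cutpoints[[band]], labels = FALSE),
                         df1$rate, df1$Credit_Band)

  # cell metric: sum(Marks) / sum(Points) per Credit_Band x New_Band cell
  summary_results <- df1 %>%
    group_by(Credit_Band, New_Band) %>%
    summarize(avg_marks = sum(Marks, na.rm = TRUE),
              sum_points = sum(Points, na.rm = TRUE), .groups = "drop") %>%
    mutate(final = avg_marks / sum_points)
  results <- reshape2::dcast(summary_results, Credit_Band ~ New_Band, value.var = "final")

  # marginal totals across rate bands (extra row) and credit bands (extra column)
  total_rows <- df1 %>% group_by(New_Band) %>%
    summarize(final = sum(Marks, na.rm = TRUE) / sum(Points, na.rm = TRUE))
  total_cols <- df1 %>% group_by(Credit_Band) %>%
    summarize(final = sum(Marks, na.rm = TRUE) / sum(Points, na.rm = TRUE))

  fin <- cbind(rbind(results[, -1], total_rows$final),
               TotalCols = c(total_cols$final, NA))
  rownames(fin) <- c(results[, 1], "Total_R")
  round(fin, 2)
}

# usage with the inputs defined above:
# build_matrix(df, credit_breaks,
#              list(Rate_Cutpoints1, Rate_Cutpoints2, Rate_Cutpoints3,
#                   Rate_Cutpoints4, Rate_Cutpoints5, Rate_Cutpoints6))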
Related
Update dt columns based on named list
Let's say I have the following my_dt data.table:

 neutrons spectrum geography
     2.30     -1.2      KIEL
     2.54     -1.6      KIEL
     2.56     -0.9      JUNG
     2.31     -0.3       ANT

I also have the following named list (my_list):

> my_list
$particles
[1] "neutrons"
$station
[1] NA
$energy
[1] "spectrum"
$area
[1] "geography"
$gamma
[1] NA

The values of this list correspond to the column names in my dataset (if they exist; if they are absent, NA). Based on my dataset and this list, I need to check which columns exist in my_dt and rename them (based on the my_list names), and for NA values I need to create columns filled with NAs. So I want to obtain the following dataset:

> final_dt
 particles station energy area gamma
      2.30      NA   -1.2 KIEL    NA
      2.54      NA   -1.6 KIEL    NA
      2.56      NA   -0.9 JUNG    NA
      2.31      NA   -0.3  ANT    NA

I tried to implement this using apply-family functions, but at the moment I can't obtain exactly what I want, so I would be grateful for any help!
data.table using lapply

library(data.table)
setDT(my_dt)
setDT(my_list)

final_dt <- setnames(
  my_list[, lapply(
    .SD, function(x){
      if( x %in% colnames(my_dt)){
        my_dt[,x,with=F]
      }else{
        NA
      }
    }
  )],
  names(my_list)
)

final_dt
   particles station energy area gamma
1:      2.30      NA   -1.2 KIEL    NA
2:      2.54      NA   -1.6 KIEL    NA
3:      2.56      NA   -0.9 JUNG    NA
4:      2.31      NA   -0.3  ANT    NA

base R using sapply

setDF(my_dt)
setDF(my_list)

data.frame(
  sapply(
    my_list, function(x) if(!is.na(x)){ my_dt[,x] }else{ NA }
  )
)
  particles station energy area gamma
1      2.30      NA   -1.2 KIEL    NA
2      2.54      NA   -1.6 KIEL    NA
3      2.56      NA   -0.9 JUNG    NA
4      2.31      NA   -0.3  ANT    NA

Data

my_dt <- structure(list(neutrons = c(2.3, 2.54, 2.56, 2.31),
                        spectrum = c(-1.2, -1.6, -0.9, -0.3),
                        geography = c("KIEL", "KIEL", "JUNG", "ANT")),
                   class = "data.frame", row.names = c(NA, -4L))
my_list <- list(particles = "neutrons", station = NA, energy = "spectrum",
                area = "geography", gamma = NA)
This may not meet your needs, but since I had come up with this separately I thought I would share it just in case. You can use setnames to rename the columns based on my_list. After that, add in the missing column names with values of NA. Finally, you can use setcolorder to reorder based on your list if desired.

library(data.table)

my_vec <- unlist(my_list)
setnames(my_dt, names(my_vec[match(names(my_dt), my_vec)]))
my_dt[, (setdiff(names(my_vec), names(my_dt))) := NA]
setcolorder(my_dt, names(my_vec))

my_dt

Output

   particles station energy area gamma
1:      2.30      NA   -1.2 KIEL    NA
2:      2.54      NA   -1.6 KIEL    NA
3:      2.56      NA   -0.9 JUNG    NA
4:      2.31      NA   -0.3  ANT    NA
I wrote some simple code that should do the job for you:

l = list(c = 'cc', a = 'aa', b = NA)   # replace this with your my_list
dt = data.frame(aa = 1:3, cc = 2:4)    # replace this with my_dt
dtl = data.frame(l)
names(dt) = names(l)[na.omit(match(l, names(dt)))]
m = merge(dt, dtl[!is.element(names(dtl), names(dt))])
How to group by and fill NA with closest not NA in R dataframe column with condition on another column
I have a data frame of blood test marker results and I want to fill in the NAs by the following criteria:

For each group of ID (TIME is in ascending order), if a marker value is NA then fill it with the closest non-NA value in that group (past or future), but only if the time difference is less than 14.

This is an example of my data:

df <- data.frame(ID = c(rep(2,5), rep(4,3)),
                 TIME = c(1,22,33,43,85,-48,1,30),
                 CEA = c(1.32,1.42,1.81,2.33,2.23,29.7,23.34,18.23),
                 CA.15.3 = c(14.62,14.59,16.8,22.34,36.33,56.02,94.09,121.5),
                 CA.125 = c(33.98,27.56,30.31,NA,39.57,1171.00,956.50,825.30),
                 CA.19.9 = c(6.18,7.11,5.72, NA, 7.38,39.30,118.20,98.26),
                 CA.72.4 = c(rep(NA,5),1.32, NA, NA),
                 NSE = c(NA, 13.21, rep(NA,6)))

ID TIME   CEA CA.15.3  CA.125 CA.19.9 CA.72.4   NSE
 2    1  1.32   14.62   33.98    6.18      NA    NA
 2   22  1.42   14.59   27.56    7.11      NA 13.21
 2   33  1.81   16.80   30.31    5.72      NA    NA
 2   43  2.33   22.34      NA      NA      NA    NA
 2   85  2.23   36.33   39.57    7.38      NA    NA
 4  -48 29.70   56.02 1171.00   39.30    1.32    NA
 4    1 23.34   94.09  956.50  118.20      NA    NA
 4   30 18.23  121.50  825.30   98.26      NA    NA

ID is the patient, TIME is the time of the blood test, and the others are the markers. The only way I could do it is with loops, which I try to avoid as much as possible. I expect the output to be:

ID TIME   CEA CA.15.3  CA.125 CA.19.9 CA.72.4   NSE
 2    1  1.32   14.62   33.98    6.18      NA    NA
 2   22  1.42   14.59   27.56    7.11      NA 13.21
 2   33  1.81   16.80   30.31    5.72      NA 13.21
 2   43  2.33   22.34   30.31    5.72      NA    NA
 2   85  2.23   36.33   39.57    7.38      NA    NA
 4  -48 29.70   56.02 1171.00   39.30    1.32    NA
 4    1 23.34   94.09  956.50  118.20      NA    NA
 4   30 18.23  121.50  825.30   98.26      NA    NA

CA.19.9 and CA.125 are filled with the previous value (10 days before), NSE is filled with the previous value (11 days), and CA.72.4 is not filled since the next measurement after the value 1.32 (at TIME -48) is 49 days away.
I bet there is a much simpler, vectorized solution, but the following works.

fill_NA <- function(DF){
  sp <- split(df, df$ID)
  sp <- lapply(sp, function(DF){
    d <- diff(DF$TIME)
    i_diff <- c(FALSE, d < 14)
    res <- sapply(DF[-(1:2)], function(X){
      inx <- i_diff & is.na(X)
      if(any(inx)){
        inx <- which(inx)
        last_change <- -1
        for(i in inx){
          if(i > last_change + 1){
            if(i == 1){
              X[i] <- X[i + 1]
            }else{
              X[i] <- X[i - 1]
            }
            last_change <- i
          }
        }
      }
      X
    })
    cbind(DF[1:2], res)
  })
  res <- do.call(rbind, sp)
  row.names(res) <- NULL
  res
}

fill_NA(df)
#  ID TIME   CEA CA.15.3  CA.125 CA.19.9 CA.72.4   NSE
#1  2    1  1.32   14.62   33.98    6.18      NA    NA
#2  2   22  1.42   14.59   27.56    7.11      NA 13.21
#3  2   33  1.81   16.80   30.31    5.72      NA 13.21
#4  2   43  2.33   22.34   30.31    5.72      NA    NA
#5  2   85  2.23   36.33   39.57    7.38      NA    NA
#6  4  -48 29.70   56.02 1171.00   39.30    1.32    NA
#7  4    1 23.34   94.09  956.50  118.20      NA    NA
#8  4   30 18.23  121.50  825.30   98.26      NA    NA
Yes, you can have a vectorized solution. First let us consider the case in which you only impute using the future value. You need to create a few auxiliary variables: a variable that tells you whether the next observation belongs to the same id (so it can be used to impute), and a variable that tells you whether the next observation is less than 14 days apart from the current one. These do not depend on the specific variable you want to impute. For each variable to be imputed you will also need a variable that tells you whether the next value is missing. Then you can vectorize the following logic: when the next observation has the same id, is less than 14 days from the current one, and is not missing, copy its value into the current one. Things get more complicated when you need to decide whether to use the past or the future value, but the logic is the same. The code is below; it is a bit long but you can simplify it, I just wanted to be clear about what it does. Hope this helps.

x <- data.frame(ID = c(rep(2,5), rep(4,3)),
                TIME = c(1,22,33,43,85,-48,1,30),
                CEA = c(1.32,1.42,1.81,2.33,2.23,29.7,23.34,18.23),
                CA.15.3 = c(14.62,14.59,16.8,22.34,36.33,56.02,94.09,121.5),
                CA.125 = c(33.98,27.56,30.31,NA,39.57,1171.00,956.50,825.30),
                CA.19.9 = c(6.18,7.11,5.72, NA, 7.38,39.30,118.20,98.26),
                CA.72.4 = c(rep(NA,5),1.32, NA, NA),
                NSE = c(NA, 13.21, rep(NA,6)))

### these are the columns we want to impute
cols.to.impute <- colnames(x)[! colnames(x) %in% c("ID","TIME")]

### is the next id the same?
x$diffidf <- NA
x$diffidf[1:(nrow(x)-1)] <- diff(x$ID)
x$diffidf[x$diffidf > 0] <- NA

### is the previous id the same?
x$diffidb <- NA
x$diffidb[2:nrow(x)] <- diff(x$ID)
x$diffidb[x$diffidb > 0] <- NA

### diff in time with next observation
x$difftimef <- NA
x$difftimef[1:(nrow(x)-1)] <- diff(x$TIME)

### diff in time with previous observation
x$difftimeb <- NA
x$difftimeb[2:nrow(x)] <- diff(x$TIME)

### if next (previous) id is not the same time difference is not meaningful
x$difftimef[is.na(x$diffidf)] <- NA
x$difftimeb[is.na(x$diffidb)] <- NA

### we do not need diffid anymore (due to previous statement)
x$diffidf <- x$diffidb <- NULL

### if next (previous) point in time is more than 14 days it is not useful for imputation
x$difftimef[abs(x$difftimef) > 14] <- NA
x$difftimeb[abs(x$difftimeb) > 14] <- NA

### create variable usef that tells us whether we should attempt to use the forward observation for imputation
### it is 1 only if difftime forward is less than difftime backward
x$usef <- NA
x$usef[!is.na(x$difftimef) & x$difftimef < x$difftimeb] <- 1
x$usef[!is.na(x$difftimef) & is.na(x$difftimeb)] <- 1
x$usef[is.na(x$difftimef) & !is.na(x$difftimeb)] <- 0
if (!is.na(x$usef[nrow(x)])) stop("\nlast observation usef is not missing\n")

### now we get into column specific operations.
for (col in cols.to.impute){
  ### we will store the results in x$imputed, and copy into x[,col] at the end
  x$imputed <- x[,col]
  ### x$usef needs to be modified depending on the specific column, so we define a local version of it
  x$usef.local <- x$usef
  ### if a variable is not missing no point in looking at usef.local, so we make it missing
  x$usef.local[!is.na(x[,col])] <- NA
  ### when usef.local is 1 but the next observation is missing it cannot be used for imputation, so we
  ### make it 0. but a value of 0 does not mean we can use the previous observation because that may
  ### be missing too. so first we make usef 0 and next we check the previous observation and if that
  ### is missing too we make usef missing
  x$previous.value <- c(NA, x[1:(nrow(x)-1), col])
  x$next.value <- c(x[2:nrow(x), col], NA)
  x$next.missing <- is.na(x$next.value)
  x$previous.missing <- is.na(x$previous.value)
  x$usef.local[x$next.missing & x$usef.local == 1] <- 0
  x$usef.local[x$previous.missing & x$usef.local == 0] <- NA
  ### now we can impute properly: use next value when usef.local is 1 and previous value when usef.local is 0
  tmp <- rep(FALSE, nrow(x))
  tmp[x$usef.local == 1] <- TRUE
  x$imputed[tmp] <- x$next.value[tmp]
  tmp <- rep(FALSE, nrow(x))
  tmp[x$usef.local == 0] <- TRUE
  x$imputed[tmp] <- x$previous.value[tmp]
  ### copy to column
  x[,col] <- x$imputed
}

### get rid of useless temporary stuff
x$previous.value <- x$previous.missing <- x$next.value <- x$next.missing <- x$imputed <- x$usef.local <- NULL

  ID TIME   CEA CA.15.3  CA.125 CA.19.9 CA.72.4   NSE difftimef difftimeb usef
1  2    1  1.32   14.62   33.98    6.18      NA    NA        NA        NA   NA
2  2   22  1.42   14.59   27.56    7.11      NA 13.21        11        NA    1
3  2   33  1.81   16.80   30.31    5.72      NA 13.21        10        11    1
4  2   43  2.33   22.34   30.31    5.72      NA    NA        NA        10    0
5  2   85  2.23   36.33   39.57    7.38      NA    NA        NA        NA   NA
6  4  -48 29.70   56.02 1171.00   39.30    1.32    NA        NA        NA   NA
7  4    1 23.34   94.09  956.50  118.20      NA    NA        NA        NA   NA
8  4   30 18.23  121.50  825.30   98.26      NA    NA        NA        NA   NA
R time series interpolation, and extrapolation of a specific value
I have daily values for 11 different yield curves, that is, time series for 11 yield maturities (1yr, 2yr, 3yr, 4yr, 5yr, 7yr, 10yr, 15yr, 20yr, 25yr, 30yr) over the same period of time. Some of the yields on some days are missing (NAs) and I'd like to extrapolate their values knowing the values of the other yields on the same day. This should be done by a first linear interpolation of the available yields in a given day, and a successive extrapolation of the missing yields in the same day, using the maturity (1yr, 2yr, etc.) as weight.

For example, I have the following data set and I'd like to extrapolate the daily value of the 5yr yield based on an interpolation of all available yields on the same day:

Date          1     2     3  4  5     7    10    15    20 25 30
7/4/2007  9.642 9.278 8.899 NA NA 8.399 8.241 8.183 8.117 NA NA
7/5/2007  9.669 9.302 8.931 NA NA 8.44  8.287 8.231 8.118 NA NA
7/6/2007  9.698 9.331 8.961 NA NA 8.437 8.295 8.243 8.13  NA NA
7/9/2007  9.678 9.306 8.941 NA NA 8.409 8.269 8.214 8.092 NA NA
7/10/2007 9.65  9.283 8.915 NA NA 8.385 8.243 8.185 8.065 NA NA
7/11/2007 9.7   9.342 8.976 NA NA 8.445 8.306 8.249 8.138 NA NA
7/12/2007 9.703 9.348 8.975 NA NA 8.448 8.303 8.245 8.152 NA NA
7/13/2007 9.69  9.334 8.965 NA NA 8.439 8.294 8.24  8.145 NA NA
7/16/2007 9.683 9.325 8.964 NA NA 8.442 8.299 8.244 8.158 NA NA
7/17/2007 9.712 9.359 8.987 NA NA 8.481 8.33  8.277 8.192 NA NA
7/18/2007 9.746 9.394 9.018 NA NA 8.509 8.363 8.311 8.22  NA NA
...

Does anyone have suggestions on how to do it? Thanks.
This is one way to build a linear model for each Date based on the available info you have and use it to predict/estimate the value at year 5. Run the process step by step to see how it works, and check the estimates to make sure they make sense.

dt = read.table(text= "Date 1 2 3 4 5 7 10 15 20 25 30
7/4/2007 9.642 9.278 8.899 NA NA 8.399 8.241 8.183 8.117 NA NA
7/5/2007 9.669 9.302 8.931 NA NA 8.44 8.287 8.231 8.118 NA NA
7/6/2007 9.698 9.331 8.961 NA NA 8.437 8.295 8.243 8.13 NA NA
7/9/2007 9.678 9.306 8.941 NA NA 8.409 8.269 8.214 8.092 NA NA
7/10/2007 9.65 9.283 8.915 NA NA 8.385 8.243 8.185 8.065 NA NA
7/11/2007 9.7 9.342 8.976 NA NA 8.445 8.306 8.249 8.138 NA NA
7/12/2007 9.703 9.348 8.975 NA NA 8.448 8.303 8.245 8.152 NA NA
7/13/2007 9.69 9.334 8.965 NA NA 8.439 8.294 8.24 8.145 NA NA
7/16/2007 9.683 9.325 8.964 NA NA 8.442 8.299 8.244 8.158 NA NA
7/17/2007 9.712 9.359 8.987 NA NA 8.481 8.33 8.277 8.192 NA NA
7/18/2007 9.746 9.394 9.018 NA NA 8.509 8.363 8.311 8.22 NA NA", header=T)

library(dplyr)
library(tidyr)

dt %>%
  gather(time, value, -Date) %>%                     # reshape dataset
  filter(!is.na(value)) %>%                          # ignore NA values
  mutate(time = as.numeric(gsub("X","",time))) %>%   # get rid of the X created by importing data
  group_by(Date) %>%                                 # for each date
  do({model = lm(value~time, data=.)                 # build a linear model
      data.frame(pred = predict(model, data.frame(time=5)))})  # use model to predict at time = 5

# Source: local data frame [11 x 2]
# Groups: Date [11]
#
#         Date     pred
#       (fctr)    (dbl)
# 1  7/10/2007 8.920932
# 2  7/11/2007 8.979601
# 3  7/12/2007 8.981383
# 4  7/13/2007 8.970571
# 5  7/16/2007 8.968542
# 6  7/17/2007 8.999584
# 7  7/18/2007 9.032026
# 8   7/4/2007 8.917645
# 9   7/5/2007 8.950605
# 10  7/6/2007 8.970669
# 11  7/9/2007 8.946661

I'm not suggesting that the linear model is the best fit, as I didn't spend time checking that. You could use a quadratic model instead of a linear one, which might give you a better estimate.

In case you want to check the model output and get info about the models you built and used for each Date, you can do this:

library(dplyr)
library(tidyr)
library(broom)

dt %>%
  gather(time, value, -Date) %>%                     # reshape dataset
  filter(!is.na(value)) %>%                          # ignore NA values
  mutate(time = as.numeric(gsub("X","",time))) %>%   # get rid of the X created by importing data
  group_by(Date) %>%                                 # for each date
  do({model = lm(value~time, data=.)                 # build a linear model
      tidy(model)})                                  # check model output

# Source: local data frame [22 x 6]
# Groups: Date [11]
#
#         Date        term    estimate  std.error statistic      p.value
#       (fctr)       (chr)       (dbl)      (dbl)     (dbl)        (dbl)
# 1  7/10/2007 (Intercept)  9.29495818 0.19895389 46.719158 8.485928e-08
# 2  7/10/2007        time -0.07480530 0.01875160 -3.989275 1.043399e-02
# 3  7/11/2007 (Intercept)  9.34942937 0.19823019 47.164509 8.093526e-08
# 4  7/11/2007        time -0.07396561 0.01868339 -3.958897 1.075469e-02
# 5  7/12/2007 (Intercept)  9.35001022 0.20037595 46.662337 8.537618e-08
# 6  7/12/2007        time -0.07372537 0.01888563 -3.903781 1.136592e-02
# 7  7/13/2007 (Intercept)  9.33730855 0.19974786 46.745476 8.462114e-08
# 8  7/13/2007        time -0.07334758 0.01882643 -3.895989 1.145551e-02
# 9  7/16/2007 (Intercept)  9.33045446 0.19856561 46.989276 8.245272e-08
# 10 7/16/2007        time -0.07238243 0.01871501 -3.867615 1.178869e-02
# ..       ...         ...         ...        ...       ...          ...
tm package: Output of findAssocs() in a matrix instead of a list in R
Consider the following list:

library(tm)
data("crude")
tdm <- TermDocumentMatrix(crude)
a <- findAssocs(tdm, c("oil", "opec", "xyz"), c(0.7, 0.75, 0.1))

How do I get a data frame with all terms associated with these 3 words in the columns, showing:

the corresponding correlation coefficient (if it exists)
NA if it does not exist for this word (for example, the pair (oil, they) would show NA)
Here's a solution using reshape2 to help reshape the data:

library(reshape2)
aa <- do.call(rbind, Map(function(d, n)
  cbind.data.frame(
    xterm = if (length(d) > 0) names(d) else NA,
    cor   = if (length(d) > 0) d else NA,
    term  = n),
  a, names(a)))
dcast(aa, term ~ xterm, value.var = "cor")
Or you could use dplyr and tidyr:

library(dplyr)
library('devtools')
install_github('hadley/tidyr')
library(tidyr)

a1 <- unnest(lapply(a, function(x) data.frame(xterm = names(x), cor = x,
                                              stringsAsFactors = FALSE)), term)
a1 %>% spread(xterm, cor)
# here it removed terms without any `cor` for the `xterm`
#   term 15.8 ability above agreement analysts buyers clearly emergency fixed
# 1  oil 0.87      NA  0.76      0.71     0.79   0.70     0.8      0.75  0.73
# 2 opec 0.85     0.8  0.82      0.76     0.85   0.83      NA      0.87    NA
#   late market meeting prices prices. said that they trying  who winter
# 1  0.8   0.75    0.77   0.72      NA 0.78  0.73   NA    0.8  0.8    0.8
# 2   NA     NA    0.88     NA    0.79 0.82    NA  0.8     NA   NA     NA

Update

aNew <- sapply(tdm$dimnames$Terms, function(i) findAssocs(tdm, i, corlimit = 0.95))
aNew2 <- aNew[!!sapply(aNew, function(x) length(dim(x)))]
aNew3 <- unnest(lapply(aNew2, function(x) data.frame(xterm = rownames(x),
                                                     cor = x[,1],
                                                     stringsAsFactors = FALSE)[1:3,]), term)
res <- aNew3 %>% spread(xterm, cor)
dim(res)
# [1] 1021  160
res[1:3, 1:5]
#      term ... 100,000 10.8 1.1
# 1     ...  NA      NA   NA  NA
# 2 100,000  NA      NA   NA   1
# 3    10.8  NA      NA   NA  NA
Dividing specific values between two arrays
I am a novice R user trying to work with a data set of 40,000 rows and 300 columns. I have found a solution for what I would like to do, however my machine takes over an hour to run my code and I feel like an expert could help me with a quicker solution (as I can do this in Excel in half the time). I will post my solution at the end.

What I would like to do is the following:

Compute the average value for each column NY1 to NYn based on the value of the YYYYMMbucket column.
Divide the original value by its average YYYYMMbucket value.

Here is a sample of my original data set:

   YYYYMMbucket   NY1 NY2    NY3   NY4
1      200701.3 0.309  NA 20.719 16260
2      200701.3 0.265  NA 19.482 15138
3      200701.3 0.239  NA 19.168 14418
4      200701.3 0.225  NA 19.106 14046
5      200701.3 0.223  NA 19.211 14040
6      200701.3 0.234  NA 19.621 14718
7      200701.3 0.270  NA 20.522 15780
8      200701.3 0.298  NA 22.284 16662
9      200701.2 0.330  NA 23.420 16914
10     200701.2 0.354  NA 23.805 17310
11     200701.2 0.388  NA 24.095 17448
12     200701.2 0.367  NA 23.954 17640
13     200701.2 0.355  NA 23.255 17748
14     200701.2 0.346  NA 22.731 17544
15     200701.2 0.347  NA 22.445 17472
16     200701.2 0.366  NA 21.945 17634
17     200701.2 0.408  NA 22.683 18876
18     200701.2 0.478  NA 23.189 21498
19     200701.2 0.550  NA 23.785 22284
20     200701.2 0.601  NA 24.515 22368

This is what my averages look like:

   YYYYMMbucket      NY1M NY2M
1      200701.1 0.4424574   NA
2      200701.2 0.4530000   NA
3      200701.3 0.2936935   NA
4      200702.1 0.4624063   NA
5      200702.2 0.4785937   NA
6      200702.3 0.3091161   NA
7      200703.1 0.4159687   NA
8      200703.2 0.4491875   NA
9      200703.3 0.2840081   NA
10     200704.1 0.4279137   NA

How I would like my final output to look:

    NY1avgs NY2avgs   NY3avgs
1  1.052117      NA 0.7560868
2 0.9023011      NA 0.7109456
3 0.8137734      NA  0.699487
4 0.7661047      NA 0.6972245
5 0.7592949      NA 0.7010562
6 0.7967489      NA 0.7160181
7 0.9193256      NA 0.7488978
8  1.014663      NA 0.8131974
9 0.7284768      NA  0.857904

Here's how I did it. First I used "plyr" to compute my averages, simple enough:

test <- ddply(prf.delete2b, .(YYYYMMbucket), summarise,
              NY1M = mean(NY1),
              NY2M = mean(NY2)
              ...
              ...)

Then I used a series of the following:

x <- c(1:40893)
lookv <- function(x, ltab, rcol = 2) ltab[max(which(ltab[,1] <= x)), rcol]
NY1Fun <- function(x) (prf.delete2b$NY1[x] / lookv((prf.delete2b$YYYYMMbucket[x]), test, 2))
NY2Fun <- function(x) (prf.delete2b$NY2[x] / lookv((prf.delete2b$YYYYMMbucket[x]), test, 3))
NY1Avgs <- lapply(x, NY1Fun)
NY2Avgs <- lapply(x, NY2Fun)

I also tried a variant of the above by saying:

NY1Fun <- function(x) (prf.delete2b$NY1[x] / subset(test, YYYYMMbucket == prf.delete2b$YYYYMMbucket[x], select = c(NY1M)))
lapply(x, NY1Fun)

Each variant of NYnFun takes a good 20 seconds to run, so doing this 300 times takes much too long. Can anyone recommend any alternative to what I posted or point out any novice mistakes I've made?
Here is the customary data.table approach, which works pretty fast.

# CREATE DUMMY DATA
N = 1000
mydf = data.frame(
  bucket = sample(letters, N, replace = T),
  NY1 = runif(N),
  NY2 = runif(N),
  NY3 = runif(N),
  NY4 = runif(N)
)

# SCALE COLUMNS BY AVG
library(data.table)
scale_x = function(x) x/ave(x)
mydt = data.table(mydf)
ans  = mydt[, lapply(.SD, scale_x), by = 'bucket']
How about:

test2 <- merge(prfdelete2b, test, all.x = TRUE)
test2[2:ncol(prefdelete2b)] / test2[(ncol(prefdelete2b) + 1):ncol(test2)]
In this case, I would use ave instead of ddply because ave returns a vector the same length as its input. ave only accepts a vector, so you need to use lapply to loop over the columns of your data.frame.

myFun <- function(x, groupVar) {
  x / ave(x, groupVar, FUN = function(y) mean(y, na.rm = TRUE))
}

relToMeans <- data.frame(prf.delete2b[1],
                         lapply(prf.delete2b[-1], myFun, groupVar = prf.delete2b[1]))