Related
I have a large data frame in R with over 200 participants who have answered 152 questions. Now, I want to insert a column based on a conditional query after each "Answer" column. As an example, I have the following data frame:
data <- data.frame(Participants = 1:5,
Answer1 = c(4, 6, 2, 2, 3),
Answer2 = c(5, 1, 3, 5, 4))
I now want to insert a new conditional column of "Confidence" after each "Answer" column. For the column of "Answer1", the query would look like this:
data$Confidence1 <- ifelse(data$Answer1 == 1 | data$Answer1 == 6, 2, ifelse(data$Answer1 == 2 | data$Answer1 == 5, 1, 0))
In the end, I want the data frame to look like this:
data <- data.frame(Participants = 1:5,
Answer1 = c(4, 6, 2, 2, 3),
Confidence1 = c(0, 2, 1, 1, 0),
Answer2 = c(5, 1, 3, 5, 4),
Confidence2 = c(1, 2, 0, 1, 0))
Does anyone have an idea on how to achieve this for all "Answer" columns at once? Thanks!
Here is one solution using the modify() function from the purrr package the response by #r2evans from merge two data table into one, with alternating columns in R
library(purrr)
# create data, adding one more answer to your example
data <- data.frame(Participants = 1:5,
Answer1 = c(4, 6, 2, 2, 3),
Answer2 = c(5, 1, 3, 5, 4),
Answer3 = c(3, 2, 3, 4, 5))
# make a new df containing only the answer columns from data
answers <- data[,2:4]
# make confidence df and give it correct names
conf <- modify(answers, function(x) ifelse(x == 1 | x == 6, 2, ifelse(x == 2 | x == 5, 1, 0)))
names(conf) <- paste0("Confidence",1:ncol(answers))
# set order
neworder <- order(c(2*(seq_along(answers) - 1) + 1,
2*seq_along(conf)))
# put it together
cbind(Participants = data[,1], cbind(answers, conf)[,neworder])
I have two large data tables with the coordinates of different sequences. For example:
library(data.table)
dt1 <- data.table(cat = c(rep("A", 2), rep("B", 2)),
start = c(1, 4, 2, 15),
end = c(6, 9, 5, 20))
dt2 <- data.table(cat = c(rep("A", 2), rep("B", 2)),
start = c(2, 1, 10, 17),
end = c(7, 3, 12, 20))
I need to create a data table of the coordinates for the overlapping sequences (ie the integers that occur in the sequences given in both data tables, for each category). I can currently do this using a for loop. For example:
seq2 <- Vectorize(seq.default, vectorize.args = c("from", "to"))
out_list <- list()
for(i in 1:length(unique(dt1$cat))){
sub1 <- dt1[cat == unique(dt1$cat)[i]]
sub2 <- dt2[cat == unique(dt1$cat)[i]]
vec1 <- unique(unlist(c(seq2(from = sub1$start, to = sub1$end))))
vec2 <- unique(unlist(c(seq2(from = sub2$start, to = sub2$end))))
vec <- Reduce(intersect, list(vec1, vec2))
vec_dt <- data.table(V1 = vec)
output <- vec_dt[order(V1),
.(start = min(V1),
end = max(V1)),
by = .(grp = rleid(c(0, cumsum(diff(V1) > 1))))
]
output$grp <- NULL
output$cat <- unique(dt1$cat)[i]
out_list[[i]] <- output
print(i)
}
output_dt <- do.call("rbind", out_list)
However, the data sets I need to apply this to are very large (both in the number of rows and the size of the vectors). Is anyone able to suggest a way to improve performance?
Thanks
You could (a) convert your start/end variables to a sequence, (b) do an inner join, (c) convert back to start/end.
library(data.table)
dt1 <- data.table(cat = c(rep("A", 2), rep("B", 2)),
start = c(1, 4, 2, 15),
end = c(6, 9, 5, 20))
dt2 <- data.table(cat = c(rep("A", 2), rep("B", 2)),
start = c(2, 1, 10, 17),
end = c(7, 3, 12, 20))
# convert to sequence
dt1 = dt1[, .(sequence = start:end), by=.(cat, 1:nrow(dt1))][
, nrow := NULL]
dt2 = dt2[, .(sequence = start:end), by=.(cat, 1:nrow(dt2))][
, nrow := NULL]
# inner join + unique
overlap = merge(dt1, dt2)
overlap = unique(overlap)
# convert to start/end
overlap = overlap[, .(start=min(sequence), end=max(sequence)), by=.(cat)]
# result
overlap
#> cat start end
#> 1: A 1 7
#> 2: B 17 20
I have a dataset with hundreds of rows structured like this
User Date Value1 Value2
A 2012-01-01 4 3
A 2012-01-02 5 7
A 2012-01-03 6 1
A 2012-01-04 7 4
B 2012-01-01 2 4
B 2012-01-02 3 2
B 2012-01-03 4 9
B 2012-01-04 5 3
As the panel data has two indices (User=k, Date=t), I struggle to run a regression on R where the dependent variable (Value 1) is lagged only on the time index. the regression should be performed as follows:
Value1(k,t+1) ~ Value2(k,t)
or
Value1(k,t) ~ Value2(k,t-1)
Any suggestions?
For every user, you can do:
> df <- data.frame(User = c(rep("A", 4), rep("B", 4)),
+ Date = rep(seq.Date(as.Date("2012-01-01"), as.Date("2012-01-04"), by = "day"), 2),
+ Value1 = c(4, 5, 6, 7, 2, 3, 4, 5),
+ Value2 = c(3, 7, 1, 4, 4, 2, 9, 3))
>
> df_A <- df[df$User == "A", c("Value1", "Value2")]
> ts_A <- ts(df_A, start = c(2012, 1, 1), frequency = 365)
> ts_A <- ts.intersect(ts_A, lag(ts_A, -1))
> colnames(ts_A) <- c("Value1", "Value2", "Value1_t_1", "Value2_t_1")
>
> lm(Value1 ~ Value2_t_1, ts_A)
Call:
lm(formula = Value1 ~ Value2_t_1, data = ts_A)
Coefficients:
(Intercept) Value2_t_1
6.3929 -0.1071
>
Hope it helps.
Here's a solution using the dplyr package, you may notice in the code below I explicitly reference the lag function from dplyr as opposed to base R (stats). This is because the lag function from dplyr does not require a time series input.
I would also note that the two formulas you list may produce different regression results as you will be running them over different sets of data i.e.
Value1(k,t+1) ~ Value2(k,t) : run on the time period of 1-01-2012 to 1-03-2012
Value1(k,t) ~ Value2(k,t-1) : run on the time period of 1-02-2012 to 1-04-2012
library("tidyverse")
df <- data.frame(User = c(rep("A", 4), rep("B", 4)),
Date = rep(seq.Date(as.Date("2012-01-01"), as.Date("2012-01-04"), by = "day"), 2),
Value1 = c(4, 5, 6, 7, 2, 3, 4, 5),
Value2 = c(3, 7, 1, 4, 4, 2, 9, 3))
df2 <- df %>% arrange(User,Date) %>%
group_by(User) %>%
mutate(lag_v2 = dplyr::lag(Value2),
lead_v1 = dplyr::lead(Value1))
df3<-df2[!is.na(df2$lag_v2),]
df4<-df2[!is.na(df2$lead_v1),]
summary(lm(Value1~lag_v2,data=df3))
summary(lm(lead_v1~Value2,data=df4))
I have three data frames which I have combined in a list
d1 <- data.frame(y1 = c(1, 2, 3), y2 = c(4, 5, 6))
d2 <- data.frame(y1 = c(3, 2, 1), y2 = c(6, 5, 4))
d3 <- data.frame(y1 = c(5, 7, 8),y2 = c(6, 4, 2))
my.list <- list(d1, d2,d3)
I want to extract the first row of each element in the list, bind them row wise and save as csv file.
For example, in above example, I want to extract first row from d1, d2 and d3
row1.d1 <- c(1,4)
row1.d2 <- c(3,6)
row1.d3 <- c(5,6)
and bind them together
dat <- rbind(row1.d1,row1.d2,row1.d3)
dat
row1.d1 1 4
row1.d2 3 6
row1.d3 5 6
and repeat it for all rows.
I found a way to do this if I have a list of vectors,
A=list()
A[[1]]=c(1,2)
A[[2]]=c(3,4)
A[[3]]=c(5,6)
sapply(A,'[[',1)
But for dataframes, I am not sure how to go about it.
Another way would be the following. You go through each data frame in my.list and get the first row with lapply(). Then you bind the result.
do.call(rbind, (lapply(my.list, function(x) x[1,])))
# y1 y2
#1 1 4
#2 3 6
#3 5 6
A tidyverse/FP solution is below. I added id and df to retain information about the row number and source, respectively.
# your data
d1 <- data.frame(y1 = c(1, 2, 3), y2 = c(4, 5, 6))
d2 <- data.frame(y1 = c(3, 2, 1), y2 = c(6, 5, 4))
d3 <- data.frame(y1 = c(5, 7, 8),y2 = c(6, 4, 2))
my.list <- list(d1, d2,d3)
# tidyverse/ FP solution
library(dplyr)
library(purrr)
library(readr)
map_df(.x = seq(1:3),
.f = function(x) bind_rows(my.list[x]) %>%
mutate(id = row_number(),
df = x)) %>%
arrange(id) %>%
#select(-id, -df) %>% # uncomment if you want to lose row num and source
write_csv(path = 'yourfile.csv')
We need to use , after the 1
t(sapply(my.list, `[`, 1, ))
I have a data frame where some columns have the same data, but different column names. I would like to remove duplicated columns, but merge the column names. An example, where test1 and test4 columns are duplicates:
df
test1 test2 test3 test4
1 1 1 0 1
2 2 2 2 2
3 3 4 4 3
4 4 4 4 4
5 5 5 5 5
6 6 6 6 6
and I would like the result to be something like this:
df
test1+test4 test2 test3
1 1 1 0
2 2 2 2
3 3 4 4
4 4 4 4
5 5 5 5
6 6 6 6
Here is the data:
structure(list(test1 = c(1, 2, 3, 4, 5, 6), test2 = c(1, 2, 4,
4, 5, 6), test3 = c(0, 2, 4, 4, 5, 6), test4 = c(1, 2, 3, 4,
5, 6)), .Names = c("test1", "test2", "test3", "test4"), row.names = c(NA,
-6L), class = "data.frame")
Please note that I do not simply want to remove duplicated columns. I also want to merge the column names of the duplicated columns, after the duplicates are removed.
I could do it manually for the simple table I posted, but I want to use this on large datasets, where I don't know in advance what columns are identical. I do not what to remove and rename columns manually, since I might have over 50 duplicated columns.
Ok, improving on the above answer using the idea from here. Save the duplicate and non-duplicate columns into data frames. Check to see if the non-duplicates match any duplicates, and if so concatenate their columns names. So this will now work if you have more than two duplicate columns.
Editted: Changed summary to digest. This helps with character data.
df <- structure(list(test1 = c(1, 2, 3, 4, 5, 6), test2 = c(1, 2, 4,
4, 5, 6), test3 = c(0, 2, 4, 4, 5, 6), test4 = c(1, 2, 3, 4,
5, 6)), .Names = c("test1", "test2", "test3", "test4"), row.names = c(NA,
-6L), class = "data.frame")
library(digest)
nondups <- df[!duplicated(lapply(df, digest))]
dups <- df[duplicated(lapply(df, digest))]
for(i in 1:ncol(nondups)){
for(j in 1:ncol(dups)){
if(FALSE %in% paste0(nondups[,i] == dups[,j])) NULL
else names(nondups)[i] <- paste(names(nondups[i]), names(dups[j]), sep = "+")
}
}
nondups
Example 2, as a function.
Editted: Changed summary to digest and return non-duplicated and duplicated data frames.
age <- 18:29
height <- c(76.1,77,78.1,78.2,78.8,79.7,79.9,81.1,81.2,81.8,82.8,83.5)
gender <- c("M","F","M","M","F","F","M","M","F","M","F","M")
testframe <- data.frame(age=age,height=height,height2=height,gender=gender,gender2=gender, gender3 = gender)
dupcols <- function(df = testframe){
nondups <- df[!duplicated(lapply(df, digest))]
dups <- df[duplicated(lapply(df, digest))]
for(i in 1:ncol(nondups)){
for(j in 1:ncol(dups)){
if(FALSE %in% paste0(nondups[,i] == dups[,j])) NULL
else names(nondups)[i] <- paste(names(nondups[i]), names(dups[j]), sep = "+")
}
}
return(list(df1 = nondups, df2 = dups))
}
dupcols(df = testframe)
Editted: This section is new.
Example 3: On a large data frame
#Creating a 1500 column by 15000 row data frame
dat <- do.call(data.frame, replicate(1500, rep(FALSE, 15000), simplify=FALSE))
names(dat) <- 1:1500
#Fill the data frame with LETTERS across the rows
#This part may take a while. Took my PC about 23 minutes.
start <- Sys.time()
fill <- rep(LETTERS, times = ceiling((15000*1500)/26))
j <- 0
for(i in 1:nrow(dat)){
dat[i,] <- fill[(1+j):(1500+j)]
j <- j + 1500
}
difftime(Sys.time(), start, "mins")
#Run the function on the created data set
#This took about 4 minutes to complete on my PC.
start <- Sys.time()
result <- dupcols(df = dat)
difftime(Sys.time(), start, "mins")
names(result$df1)
ncol(result$df1)
ncol(result$df2)
It's not completely automated, but the output of the loop will identify pairs of duplicate columns. You'll then have to remove one of the duplicate columns and then re-name based on what columns were duplicates.
df <- structure(list(test1 = c(1, 2, 3, 4, 5, 6), test2 = c(1, 2, 4,
4, 5, 6), test3 = c(0, 2, 4, 4, 5, 6), test4 = c(1, 2, 3, 4,
5, 6)), .Names = c("test1", "test2", "test3", "test4"), row.names = c(NA,
-6L), class = "data.frame")
for(i in 1:(ncol(df)-1)){
for(j in 2:ncol(df)){
if(i == j) NULL
else if(FALSE %in% paste0(df[,i] == df[,j])) NULL
else print(paste(i, j, sep = " + "))
}
}
new <- df[,-4]
names(new)[1] <- paste(names(df[1]), names(df[4]), sep = "+")
new