My data looks like this:
# Sample input: one row per user. answer_id holds a JSON object whose keys are
# "row[i][j]" grid positions and whose values are the answers.
# Each JSON literal is kept on a single line so that no newline character ends
# up inside the data.
df <- data.frame(
  user_id = c('13', '15'),
  answer_id = c(
    '{"row[0][0]":"A","row[0][1]":"B","row[0][2]":"C","row[0][3]":"D","row[1][0]":"A1","row[1][1]":"B1","row[1][2]":"C1","row[1][3]":"D1"}',
    '{"row[0][0]":"W","row[0][1]":"X","row[0][2]":"Y","row[0][3]":"Z","row[1][0]":"W1","row[1][1]":"X1","row[1][2]":"Y1","row[1][3]":"Z1"}'
  )
)
Desired data view
user_id answer_id1 answer_id2 answer_id3 answer_id4
13 A B C D
13 A1 B1 C1 D1
15 W X Y Z
15 W1 X1 Y1 Z1
I'm new to R and hope to get a solution soon, as I always do.
This may not be the best solution, but it gets you from your sample input to your desired output using stringr, purrr, and tidyr. See regex101 for an explanation of the regex used in the stringr::str_match_all() call.
# Sample input: one row per user, answers stored as a JSON-like string whose
# keys are "row[i][j]" grid positions.
df <- data.frame(
  user_id = c('13', '15'),
  answer_id = c(
    '{"row[0][0]":"A","row[0][1]":"B","row[0][2]":"C","row[0][3]":"D","row[1][0]":"A1","row[1][1]":"B1","row[1][2]":"C1","row[1][3]":"D1"}',
    '{"row[0][0]":"W","row[0][1]":"X","row[0][2]":"Y","row[0][3]":"Z","row[1][0]":"W1","row[1][1]":"X1","row[1][2]":"Y1","row[1][3]":"Z1"}'
  ),
  stringsAsFactors = FALSE
)

# Use a regex to extract both grid indices and the answer from every
# "row[i][j]":"value" pair. Each input string yields one match matrix with the
# columns: full match, i, j, value.
regex_matches <- stringr::str_match_all(
  df$answer_id,
  '\\"row\\[(\\d+)\\]\\[(\\d+)\\]\\":\\"([^\\"]*)\\"'
)

# Add the user id to each result and drop the full-match column.
# drop = FALSE keeps a one-row match as a matrix, so cbind() still produces
# one row with four columns for a user that has a single answer.
answers_by_user <- purrr::map2(
  df$user_id,
  regex_matches,
  ~ cbind(.x, .y[, -1, drop = FALSE])
)

# Combine the list of matrices and convert to a data frame.
answers_df <- data.frame(
  do.call(rbind, answers_by_user),
  stringsAsFactors = FALSE
)

# Add meaningful names: row_1 is the first index (grid row), row_2 the second
# index (position within that row).
names(answers_df) <- c("user_id", "row_1", "row_2", "value")

# Convert to wide: one output row per (user_id, row_1), one column per row_2.
final_df <- as.data.frame(
  tidyr::pivot_wider(answers_df, names_from = row_2, values_from = value)
)

# Remove the helper row-index column.
final_df$row_1 <- NULL

# Clean up names; derived from the column count so that grids wider than four
# answers are labelled too.
names(final_df) <- c(
  "user_id",
  paste0("answer_id", seq_len(ncol(final_df) - 1L))
)
final_df
#output
user_id answer_id1 answer_id2 answer_id3 answer_id4
1 13 A B C D
2 13 A1 B1 C1 D1
3 15 W X Y Z
4 15 W1 X1 Y1 Z1
Column 2 looks like JSON, so you could do something like this to get it into a form that you can do something with...
library(rjson)
# Parse each row's JSON string into a named vector (names are the "row[i][j]"
# keys) and pair every answer with the user id of that row.
# seq_len() keeps the loop empty when df has no rows.
df2 <- lapply(seq_len(nrow(df)), function(i) {
  answers <- unlist(fromJSON(as.character(df[i, 2])))
  data.frame(user = df[i, 1], answer = answers, stringsAsFactors = FALSE)
})
df2 <- do.call(rbind, df2)
# The row names still carry the "row[i][j]" key (rbind may append a suffix to
# keep them unique, which the trailing ".*" absorbs). \\d+ accepts indices of
# any length, e.g. row[10][0].
df2[, "r1"] <- gsub(".+\\[(\\d+)]\\[(\\d+)].*", "\\1", rownames(df2))
df2[, "r2"] <- gsub(".+\\[(\\d+)]\\[(\\d+)].*", "\\2", rownames(df2))
df2
user answer r1 r2
row[0][0] 13 A 0 0
row[0][1] 13 B 0 1
row[0][2] 13 C 0 2
row[0][3] 13 D 0 3
row[1][0] 13 A1 1 0
row[1][1] 13 B1 1 1
row[1][2] 13 C1 1 2
row[1][3] 13 D1 1 3
row[0][0]1 15 W 0 0
row[0][1]1 15 X 0 1
row[0][2]1 15 Y 0 2
row[0][3]1 15 Z 0 3
row[1][0]1 15 W1 1 0
row[1][1]1 15 X1 1 1
row[1][2]1 15 Y1 1 2
row[1][3]1 15 Z1 1 3
Related
I am struggling to solve the captioned problem.
My dataframe is like:
X1 X2 X3 X4 X5
1 1 2 3 4 5
2 6 7 8 9 10
3 11 12 13 14 15
What I am trying to do is randomly select 3 elements from the third and fourth columns and replace their values with 0. So the manipulated data frame could look like this:
X1 X2 X3 X4 X5
1 1 2 3 4 5
2 6 7 0 0 10
3 11 12 13 0 15
I saw from here Random number selection from a data-frame that it could be easier if I convert the data frame into matrix, so I tried
# Build the 3 x 5 example (values 1..15, one rbind() argument per row) and
# convert it to a matrix.
mat <- data.frame(rbind(rep(1:5, 1), rep(6:10, 1), rep(11:15, 1)))
mat_matrix <- as.matrix(mat)
# NOTE: sample() is handed the *values* stored in columns 3:4, and the three
# values it draws are then used as positions in the whole matrix, so the zeros
# are not restricted to columns 3 and 4.
mat_matrix[sample(mat_matrix[, 3:4], 3)] <- 0
But it just randomly picked 3 elements across all columns and rows in the matrix and turned them into 0.
Can anyone help me out?
You can use slice.index and sample from that.
# Number every (row, column) cell of the matrix by slicing on both margins.
cell_index <- slice.index(mat_matrix, 1:2)
# Draw 3 cell numbers, restricted to the cells of columns 3 and 4.
picked_cells <- sample(cell_index[, 3:4], 3)
# Zero out exactly those cells.
mat_matrix[picked_cells] <- 0
Nothing wrong with a for loop in this case. Perhaps like this:
# Example data: 3 rows x 5 columns holding 1..15, filled row by row.
mat <- data.frame(rbind(1:5, 6:10, 11:15))

# Columns that are allowed to receive zeros.
target_cols <- c(3, 4)
n_cells <- nrow(mat) * length(target_cols)

# Draw 3 distinct cell numbers out of the candidate cells.
hits <- sample(x = 1:n_cells, size = 3)

# Logical mask shaped like the candidate sub-table; TRUE marks a drawn cell.
mask <- matrix(FALSE, nrow = nrow(mat), ncol = length(target_cols))
mask[hits] <- TRUE

# Apply the mask one target column at a time.
for (j in seq_along(target_cols)) {
  mat[mask[, j], target_cols[j]] <- 0
}
Just create a two column "index matrix" that you sample on and use to replace back into your data.
Here is one way using replace
# Columns eligible for replacement and the number of cells to zero out.
cols <- c("X3", "X4")
N <- 3
# Work on the selected columns as a matrix so a single index addresses a cell.
selected <- as.matrix(df[cols])
zero_at <- sample(length(selected), N)
# Write the modified columns back into the data frame.
df[cols] <- replace(selected, zero_at, 0)
such that
> df
X1 X2 X3 X4 X5
1 1 2 3 0 5
2 6 7 8 0 10
3 11 12 0 14 15
This is my sample data
> data.frame
a b c d
W_1_N NA NA NA NA
W_1_E 2 2 2 4
W_1_C 4 2 2 4
W_1_D NA NA NA NA
First, I had to find, within each row of the matrix, the pairs of column names where one element is 4 and the other is 2.
In a result it looks like this
W_1_E.1 d a
W_1_E.2 d b
W_1_E.3 d c
W_1_C.1 a b
W_1_C.2 a c
W_1_C.3 d b
W_1_C.4 d c
I wanted only pairs where one element is 4 and the other is 2 in the same row. W_1_N and W_1_D contain only NA, so they were omitted. W_1_E appears in 3 rows because there are 3 (4, 2) pairs in that row of the sample data. W_1_C has 4 pairs.
This is code:
# Build, for every row label, the list of column names per distinct value,
# then split the result into one piece per row label.
lst=data.frame(df) %>%
# keep the row labels as a regular column named "rn"
rownames_to_column("rn") %>%
# discard rows that contain NA
drop_na() %>%
# long format: one line per (rn, column name, value)
gather(key, value, -rn) %>%
# one group per (row label, value) combination
group_by(rn, value) %>%
# l: list-column holding the column names that carry this value in this row
summarise(l = list(unique(key))) %>%
split(.$rn)
# For every row label, cross the column names of the first value group with
# those of the second, then stack all pair tables.
# NOTE(review): this assumes each remaining row label has exactly two distinct
# values (so that l[[1]] and l[[2]] both exist) -- confirm for real data.
pair=do.call("rbind", lapply(lst, function(x) expand.grid(x$l[[1]],
x$l[[2]])))
It works perfectly, but now I have second data.frame:
a b c d
W_1_N 0 1 1 1
W_1_E 1 1 0 0
W_1_C 1 1 1 0
W_1_D 1 0 1 1
Here is my problem: I want to keep only those pairs where both elements of the pair have the value 1 in the second data.frame. For example, the first pair of my result, W_1_E.1 d a, should be eliminated because d has the value 0 in the W_1_E row of the second data.frame.
The output should be:
W_1_C.1 a b
W_1_C.2 a c
d has the value 0 in the W_1_E row, so all rows with W_1_E in my result data.frame were eliminated (all of its pairs included d). The last two W_1_C rows were eliminated because d is also 0 in the W_1_C row of the second data.frame.
Thanks for your help
How's this?
# Observed values (2, 4 or NA) per row label N.
x <- "N a b c d
W_1_N NA NA NA NA
W_1_E 2 2 2 4
W_1_C 4 2 2 4
W_1_D NA NA NA NA "
x1 <- read.table(text = x, header = TRUE)

# 0/1 flags with the same layout; a value only counts where its flag is not 0.
x <- "N a b c d
W_1_N 0 1 1 1
W_1_E 1 1 0 0
W_1_C 1 1 1 0
W_1_D 1 0 1 1 "
x2 <- read.table(text = x, header = TRUE)

# Every column except the row label is a value column. Deriving the names
# here avoids hard-coded column positions; x2 is expected to carry the same
# column names as x1.
value_cols <- setdiff(names(x1), "N")

# Line the two tables up by row label. merge() suffixes the clashing columns
# with ".x" (values from x1) and ".y" (flags from x2).
both <- merge(x1, x2, by = "N")

# Blank out (NA) every value whose flag is 0, one column at a time.
masked <- Map(
  function(value, flag) ifelse(flag == 0, NA, value),
  both[paste0(value_cols, ".x")],
  both[paste0(value_cols, ".y")]
)
df <- data.frame(N = both$N, setNames(masked, value_cols))

# Long format: one line per (row label, column name, value), column by column.
df_all <- data.frame(
  N = rep(df$N, times = length(value_cols)),
  key1 = rep(value_cols, each = nrow(df)),
  value = unlist(df[value_cols], use.names = FALSE)
)

# Columns holding a 2 and columns holding a 4, per row label.
df2 <- df_all[!is.na(df_all$value) & df_all$value == 2, ]
df4 <- df_all[!is.na(df_all$value) & df_all$value == 4, ]

# Inner join on the row label pairs every "2" column (key1.x) with every
# "4" column (key1.y) of the same row.
valid_pairs <- merge(df2[, 1:2], df4[, 1:2], by = "N")
valid_pairs
I want to merge two data frames, skipping rows based on a specific column value, but still keep the skipped rows in the final merged data frame. I can manage the first part (skipping), but not the second.
Here are the data frames:
# Data frame 1 values
ids1 <- 1:3
x1 <- c(100, 101, 102)
doNotMerge <- c(1, 0, 0)
# Columns of the second data frame
ids2 <- 1:3
x2 <- c(200, 201, 202)
# Bind the columns into named numeric matrices, then convert to data frames
df1 <- as.data.frame(cbind(ID = ids1, X1 = x1, DoNotMerge = doNotMerge))
df2 <- as.data.frame(cbind(ID = ids2, X2 = x2))
# df1 contents:
# ID X1 DoNotMerge
# 1 1 100 1
# 2 2 101 0
# 3 3 102 0
# df2 contents:
# ID X2
# 1 1 200
# 2 2 201
# 3 3 202
I used merge:
# Full outer join on ID after leaving out the df1 rows flagged DoNotMerge == 1.
# all = TRUE (spelled out, since T can be reassigned) keeps rows that have no
# partner in the other table.
merged <- merge(df1[df1$DoNotMerge != 1, ], df2, by = "ID", all = TRUE)
# merged contents:
# ID X1 DoNotMerge X2
# 1 1 NA NA 200
# 2 2 101 0 201
# 3 3 102 0 202
The skipping part I was able to do, but what I actually want is to keep the df1 row where DoNotMerge == 1, like so:
# ID X1 DoNotMerge X2
# 1 1 NA NA 200
# 2 1 100 1 NA
# 3 2 101 0 201
# 4 3 102 0 202
Can anyone please help? Thanks.
Update: I actually found the solution while writing the question (ran into this question), so figured I'd post it in case someone else encounters this problem:
# library() stops with an error when plyr is not installed, whereas require()
# only returns FALSE and lets the next line fail with a less helpful message.
library(plyr)
# Append the df1 rows that were held back from the merge; rbind.fill() is used
# because the two data frames do not share the same set of columns.
rbind.fill(merged, df1[df1$DoNotMerge == 1, ])
# Result:
# ID X1 DoNotMerge X2
# 1 1 NA NA 200
# 2 2 101 0 201
# 3 3 102 0 202
# 4 1 100 1 NA
What I want to do is multiply all the values in column 1 of a data.frame by the first element in a vector, then multiply all the values in column 2 by the 2nd element in the vector, etc...
c1 <- c(1,2,3)
c2 <- c(4,5,6)
c3 <- c(7,8,9)
d1 <- data.frame(c1,c2,c3)
c1 c2 c3
1 1 4 7
2 2 5 8
3 3 6 9
v1 <- c(1,2,3)
So the result is this:
c1 c2 c3
1 1 8 21
2 2 10 24
3 3 12 27
I can do this one column at a time but what if I have 100 columns? I want to be able to do this programmatically.
Or simply diagonalize the vector, so that each row entry is multiplied by the corresponding element in v1:
c1 <- c(1, 2, 3)
c2 <- c(4, 5, 6)
c3 <- c(7, 8, 9)
# cbind() already returns a matrix, so no further conversion is needed.
d1 <- cbind(c1, c2, c3)
v1 <- c(1, 2, 3)
# Right-multiplying by a diagonal matrix scales column j by v1[j].
# nrow is given explicitly because diag(v1) with a single number builds an
# identity matrix of that size instead of a 1 x 1 matrix.
scaled <- d1 %*% diag(v1, nrow = length(v1))
scaled
[,1] [,2] [,3]
[1,] 1 8 21
[2,] 2 10 24
[3,] 3 12 27
Transposing the dataframe works.
c1 <- c(1, 2, 3)
c2 <- c(4, 5, 6)
c3 <- c(7, 8, 9)
d1 <- data.frame(c1, c2, c3)
v1 <- c(1, 2, 3)
# After transposing, each original column is a row, so multiplying by v1
# (recycled down the rows) scales original column j by v1[j].
flipped <- t(d1)
# Transpose back to the original orientation.
t(flipped * v1)
# c1 c2 c3
#[1,] 1 8 21
#[2,] 2 10 24
#[3,] 3 12 27
EDIT: If all columns are not numeric, you can do the following
c1 <- c(1, 2, 3)
c2 <- c(4, 5, 6)
c3 <- c(7, 8, 9)
d1 <- data.frame(c1, c2, c3)
# Adding a column of characters for demonstration
d1$c4 <- c("rr", "t", "s")
v1 <- c(1, 2, 3)
# Choosing only numeric columns; vapply() always returns a logical vector,
# whatever the shape of d1.
index <- which(vapply(d1, is.numeric, logical(1)))
# v1 must supply exactly one multiplier per numeric column, otherwise the
# multiplication below would silently recycle it.
stopifnot(length(v1) == length(index))
# drop = FALSE keeps a matrix even when only one column is numeric.
d1_mat <- as.matrix(d1[, index, drop = FALSE])
# Transpose so the columns become rows, scale, and transpose back.
d1[, index] <- t(t(d1_mat) * v1)
d1
# c1 c2 c3 c4
#1 1 8 21 rr
#2 2 10 24 t
#3 3 12 27 s
We can also replicate the vector to make the lengths equal and then multiply
# col(d1) holds the column number of every cell, so v1[col(d1)] expands v1 to
# one multiplier per cell before the element-wise product.
d1*v1[col(d1)]
# c1 c2 c3
#1 1 8 21
#2 2 10 24
#3 3 12 27
Or use sweep
# MARGIN = 2: v1 is applied across the columns of d1, combined with "*".
sweep(d1, 2, v1, FUN="*")
Or with mapply to multiply the corresponding columns of 'data.frame' and elements of 'vector'
# Walks over the columns of d1 and the elements of v1 in parallel and
# multiplies each column by its matching element.
mapply(`*`, d1, v1)
Shown as below:
# Example data: X1 cycles through "a", "b", "c"; X3 is random, so it differs
# between runs.
df <- data.frame(X1 = rep(letters[1:3],3),
X2 = 1:9,
X3 = sample(1:50,9))
df
# grep() returns the *positions* of the matching elements (integers), not a
# TRUE/FALSE mask.
ind<- grep("a|c", df$X1)
library(data.table)
# Selecting by position works as intended.
df_ac <- df[ind,]
# NOTE: `!` turns the non-zero positions into FALSE values, so this selects
# no rows at all instead of the non-matching rows.
df_b <- df[!ind,]
df_ac is created using the regular grep command. Now I want to use grep the reverse way: to select all observations with X1 == 'b'.
I know I can do this by:
# invert = TRUE returns the positions of the elements that do NOT match.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
ind2 <- grep("a|c", df$X1, invert = TRUE)
df_b <- df[ind2, ]
But, in my original script, why does the command df_b <-df[!ind,] return a data frame with zero observation?
Can anyone explain why my logic here is wrong? Is there any other way to select observations in a data.frame by using grep in reverse without using the invert argument? Thank you!
You may be more interested in grepl instead of grep:
# grepl() returns one TRUE/FALSE per element of df$X1, so the result can be
# used as a row mask and negated with `!`.
ind<- grepl("a|c", df$X1)
df[ind,]
# X1 X2 X3
# 1 a 1 16
# 3 c 3 38
# 4 a 4 10
# 6 c 6 18
# 7 a 7 33
# 9 c 9 49
df[!ind,]
# X1 X2 X3
# 2 b 2 5
# 5 b 5 14
# 8 b 8 50
Alternatively, go ahead and make use of "data.table" and try out %in% to see what else might work for you. Notice the difference in the syntax.
# Values of X1 to select.
ind2 <- c("a", "c")
library(data.table)
# Convert df to a data.table so that columns can be referenced by bare name
# inside [ ].
setDT(df)
# Rows whose X1 is one of the values in ind2.
df[X1 %in% ind2]
# X1 X2 X3
# 1: a 1 16
# 2: c 3 38
# 3: a 4 10
# 4: c 6 18
# 5: a 7 33
# 6: c 9 49
df[!X1 %in% ind2]
# X1 X2 X3
# 1: b 2 5
# 2: b 5 14
# 3: b 8 50