Related
I am struggling to solve the captioned problem.
My dataframe is like:
X1 X2 X3 X4 X5
1 1 2 3 4 5
2 6 7 8 9 10
3 11 12 13 14 15
What I am trying to do is randomly selecting 3 elements from the third and fourth column and replace their values by 0. So the manipulated dataframe could be like
X1 X2 X3 X4 X5
1 1 2 3 4 5
2 6 7 0 0 10
3 11 12 13 0 15
I saw from here Random number selection from a data-frame that it could be easier if I convert the data frame into matrix, so I tried
mat <- data.frame(rbind(rep(1:5, 1), rep(6:10, 1), rep(11:15, 1)))
mat_matrix <- as.matrix(mat)
mat_matrix[sample(mat_matrix[, 3:4], 3)] <- 0
But it just randomly picked 3 elements across all columns and rows in the matrix and turned them into 0.
Can anyone help me out?
You can use slice.index and sample from that.
mat_matrix[sample(slice.index(mat_matrix, 1:2)[,3:4], 3)] <- 0
Nothing wrong with a for loop in this case. Perhaps like this:
mat <- data.frame(rbind(rep(1:5, 1), rep(6:10, 1), rep(11:15, 1)))
cols <- c(3,4)
n <- nrow(mat)*length(cols)
v <- sample( x=1:n, size=3 )
m <- matrix(FALSE, ncol=length(cols), nrow=nrow(mat))
m[v] <- TRUE
for( i in seq_along(cols) ) {
mat[ m[,i], cols[i] ] <- 0
}
Just create a two column "index matrix" that you sample on and use to replace back into your data.
Here is one way using replace
cols <- c("X3", "X4")
N <- 3
df[cols] <- replace(as.matrix(df[cols]), sample(length(unlist(df[cols])), N), 0)
such that
> df
X1 X2 X3 X4 X5
1 1 2 3 0 5
2 6 7 8 0 10
3 11 12 0 14 15
I am currently trying to cycle through a dataframe of integers and characters and change one value of each row, conditionally. For all rows that do not meet the conditions I would just like to add them back into a new dataframe filled with the modified rows.
I've done this before with no trouble, but I feel as though I have been staring at this too long without any enlightenment.
a<-data.frame(cbind(1,'a',2,'c',3,'d'), stringsAsFactors = F)
b<-data.frame(cbind(1,'a',2,'c',3,'g'), stringsAsFactors = F)
c<-data.frame(cbind(1,'f',4,'g',5,'h'), stringsAsFactors = F)
x<-rbind(a,b,c)
fun<-function(x){
fin<-NULL
for(i in 1:nrow(x)){
v<-x[i+1,]
if ((x[i,1]== v[i,1]) & (x[i,2]==v[i,2]) ){
x[i,3]<-"f"
fin<-rbind(fin, x[i,])
}else {fin<-rbind(fin, x[i,]) }
return(fin)
}
}
fun(x)
X1 X2 X3 X4 X5 X6
1 1 a f c 3 d
>
The result I desire:
X1 X2 X3 X4 X5 X6
1 1 a f c 3 d
1 1 a 2 c 3 g
1 1 f 4 g 5 h
Or an alternative:
library(dplyr)
library(magrittr)
> z <- x %>% mutate(match = ifelse(( (lead(X1)==X1) & (lead(X2)==X2)),"YES","NO"))
> z %>% mutate(X3 = replace(X3, match=="YES", "f"))
X1 X2 X3 X4 X5 X6 match
1 1 a f c 3 d YES
2 1 a 2 c 3 g NO
3 1 f 4 g 5 h <NA>
My Data looks like
df <- data.frame(user_id=c('13','15'),
answer_id = c('{"row[0][0]":"A","row[0][1]":"B","row[0][2]":"C","row[0][3]":"D","row[1][0]":"A1","row[1][1]":"B1","row[1][2]":"C1","row[1][3]":"D1"}', '{"row[0][0]":"W","row[0][1]":"X","row[0][2]":"Y","row[0][3]":"Z","row[1][0]":"W1","row[1][1]":"X1","row[1][2]":"Y1","row[1][3]":"Z1"}
'))
Desired data view
user_id answer_id1 answer_id2 answer_id3 answer_id4
13 A B C D
13 A1 B1 C1 D1
15 W X Y Z
15 W1 X1 Y1 Z1
i'm new with R and hope to get solution soon as i do always
may not be the best solution but this can get you from your sample input to your desired output using stringr, purrr, & tidyr. See regex101 for an explanation of the regex used in the stringr::str_match_all() call.
df <- data.frame(user_id=c('13','15'),
answer_id = c('{"row[0][0]":"A","row[0][1]":"B","row[0][2]":"C","row[0][3]":"D","row[1][0]":"A1","row[1][1]":"B1","row[1][2]":"C1","row[1][3]":"D1"}', '{"row[0][0]":"W","row[0][1]":"X","row[0][2]":"Y","row[0][3]":"Z","row[1][0]":"W1","row[1][1]":"X1","row[1][2]":"Y1","row[1][3]":"Z1"}'),
stringsAsFactors=F)
#use regex to extract row ids and answers
regex_matches <- stringr::str_match_all(df$answer_id, '\\"row\\[(\\d+)\\]\\[(\\d+)\\]\\":\\"([^\\"]*)\\"')
#add user id to each result
answers_by_user <- purrr::map2(df$user_id, regex_matches, ~cbind(.x, .y[,-1]))
#combine list of matrices and convert to df
answers_df <- data.frame(do.call(rbind, answers_by_user))
#add meaningful names
names(answers_df) <- c("user_id", "row_1", "row_2", "value")
#convert to wide
spread_row_1 <- tidyr::spread(answers_df, row_1, value)
final_df <- tidyr::spread(answers_df, row_2, value)
#remove row column
final_df$row_1 <- NULL
#clean up names
names(final_df) <- c("user_id", "answer_id1", "answer_id2", "answer_id3", "answer_id4")
final_df
#output
user_id answer_id1 answer_id2 answer_id3 answer_id4
1 13 A B C D
2 13 A1 B1 C1 D1
3 15 W X Y Z
4 15 W1 X1 Y1 Z1
Column 2 looks like JSON, so you could do something like this to get it into a form that you can do something with...
library(rjson)
df2 <- lapply(1:nrow(df),function(i)
data.frame(user=df[i,1],
answer=unlist(fromJSON(as.character(df[i,2]))),stringsAsFactors = FALSE))
df2 <- do.call(rbind,df2)
df2[,"r1"] <- gsub(".+\\[(\\d)]\\[(\\d)].*","\\1",rownames(df2))
df2[,"r2"] <- gsub(".+\\[(\\d)]\\[(\\d)].*","\\2",rownames(df2))
df2
user answer r1 r2
row[0][0] 13 A 0 0
row[0][1] 13 B 0 1
row[0][2] 13 C 0 2
row[0][3] 13 D 0 3
row[1][0] 13 A1 1 0
row[1][1] 13 B1 1 1
row[1][2] 13 C1 1 2
row[1][3] 13 D1 1 3
row[0][0]1 15 W 0 0
row[0][1]1 15 X 0 1
row[0][2]1 15 Y 0 2
row[0][3]1 15 Z 0 3
row[1][0]1 15 W1 1 0
row[1][1]1 15 X1 1 1
row[1][2]1 15 Y1 1 2
row[1][3]1 15 Z1 1 3
The data contains four fields: id, x1, x2, and x3.
id <- c(1,2,3,4,5,6,7,8,9,10)
x1 <- c(2,4,5,3,6,4,3,6,7,7)
x2 <- c(0,1,2,6,7,6,0,8,2,2)
x3 <- c(5,3,4,5,8,3,4,2,5,6)
DF <- data.frame(id, x1,x2,x3)
Before I ask the question, let me create a new field (minX) which is the min of (x1,x2,x3)
DF$minX <- pmin(DF$x1, DF$x2, DF$x3)
I need to create a new field, y, that is defined as follows
if min(x1,x2,x3) = x1, then y = "x1"
if min(x1,x2,x3) = x2, then y = "x2"
if min(x1,x2,x3) = x3, then y = "x3"
Note: we assume no ties.
As a simply solution, do:
VARS <- colnames(DF)[-1]
y <- VARS[apply(DF[, -1], MARGIN = 1, FUN = which.min)]
DF$y <- y
The function which.min returns the index of the minimum. If the minimum is not unique it returns the first one. Since you guarantee that there is no tie, this is not an issue here.
Finally, you should be familiar with apply, right? MARGIN = 1 means applying function FUN row-wise, while MARGIN = 2 means applying FUN column-wise. This is an useful function to avoid the need for a for loop when dealing with matrix. Since your data frame only contains numerical/integer values, it is like a matrix hence we can use apply.
Here is another option using pmin and max.col
library(data.table)
setDT(DF)[, c("minx", "y") := list(do.call(pmin, .SD),
names(.SD)[max.col(-1*.SD)]), .SDcols= x1:x3]
DF
# id x1 x2 x3 minx y
# 1: 1 2 0 5 0 x2
# 2: 2 4 1 3 1 x2
# 3: 3 5 2 4 2 x2
# 4: 4 3 6 5 3 x1
3 5: 5 6 7 8 6 x1
# 6: 6 4 6 3 3 x3
# 7: 7 3 0 4 0 x2
# 8: 8 6 8 2 2 x3
# 9: 9 7 2 5 2 x2
#10: 10 7 2 6 2 x2
a data.table solution:
# create variables
id <- c(1,2,3,4,5,6,7,8,9,10)
x1 <- c(2,4,5,3,6,4,3,6,7,7)
x2 <- c(0,1,2,6,7,6,0,8,2,2)
x3 <- c(5,3,4,5,8,3,4,2,5,6)
DF <- data.frame(id, x1,x2,x3)
# load package and set data table, calculating min
library(data.table)
setDT(DF)[, minx := apply(.SD, 1, min), .SDcols=c("x1", "x2", "x3")]
# Create variable with name of minimum
DF[, y := apply(.SD, 1, function(x) names(x)[which.min(x)]), .SDcols = c("x1", "x2", "x3")]
# call result
DF
## id x1 x2 x3 minx y
1: 1 2 0 5 0 x2
2: 2 4 1 3 1 x2
3: 3 5 2 4 2 x2
4: 4 3 6 5 3 x1
5: 5 6 7 8 6 x1
6: 6 4 6 3 3 x3
7: 7 3 0 4 0 x2
8: 8 6 8 2 2 x3
9: 9 7 2 5 2 x2
10: 10 7 2 6 2 x2
The last step can be called directly, without the need to calculate minx.
Please notice that data.table is particularily fast in large data sets.
######## EDIT TO ADD: DPLYR METHOD #########
For completeness, this would be a dplyr method to produce the same (final) result. This solution is credited to #eipi10 in a question I started out of this problem (see here):
DF %>% mutate(y = apply(.[,2:4], 1, function(x) names(x)[which.min(x)]))
This solution takes about the same time as the data.table one provided in the original answer, when applyed to a 1e6 rows data frame (about 17 secs in my sony laptop).
I want to create a matrix with similarities based on two identifiers, consider following matrix:
x1 <- c(2,2,2,3,1,2,4,6,4)
y1 <- c(5,4,3,3,4,2,1,6,3)
x2 <- c(8,2,7,3,1,2,2,2,6)
y2 <- c(1,3,3,3,1,2,4,3,8)
x3 <- c(4,4,1,2,4,6,3,2,9)
y3 <- c(1,2,3,3,1,2,4,6,1)
id1 <- c("a","a","a","a","b","b","b","b","b")
id2 <- c(2002,2002,2003,2003,2002,2002,2003,2003,2003)
dat <- data.frame(x1,y1,x2,y2,x3,y3,id1,id2)
For the groups marked by id1 and id2 I want to create the euclidean distance (sqrt((x1a-x1b)^2+(y1a-y1b)^2 + ... + (y3a-y3b)^2)) between the lines in the dataset. In the best case, there would be a new variable that indicates the distances of each line to each other line with the same id1 and id2. Please note that different numbers of members can be in each group as for instance in 2003 in the b-group there are three cases.
Any advice would be great!!!
I think it would be a good idea first to distinguish the lines whose distances you want to calculate. For example, for id1 == b and id2 == 2003 you have 3 lines, and you want to calculate 3 different distances (between each possible pair). So let's first assign each of these a unique id.
f <- function(n) {
# Returns a vector
# 1, 2, 1, 3, ..., 1, n, 2, 3, 2, 4, ..., 2, n, ..., (n-1), n
m <- matrix(ncol = 2, nrow = n * (n-1) / 2)
m[, 1] <- rep(1:(n-1), (n-1):1)
m[, 2] <- unlist(lapply(2:n, function(x) x:n))
as.numeric(t(m))
}
# Alternatively,
# f <- function(n) {
# d <- expand.grid(a = 1:n, b = 1:n)
# d <- d[d$a < d$b, ]
# unlist(d)
# }
# but this is slower
# Using plyr...
library(plyr)
dat <- ddply(dat, .(id1, id2), function(d) {
d <- d[f(nrow(d)), ]
d$id3 <- paste0(d$id1, rep(1:(nrow(d) / 2), each = 2))
d
})
# ...or using base R
dat <- do.call(rbind,
by(dat, list(dat$id1, dat$id2), function(d) {
d <- d[f(nrow(d)), ]
d$id3 <- paste0(d$id1, rep(1:(nrow(d) / 2), each = 2))
d
}))
Now there will only be two lines for each (id3, id2) pair and you can calculate the differences as follows
# Using plyr
result <- ddply(dat, .(id3, id2), function(d) {
d <- d[paste0(rep(c("x", "y"), 3), 1:3)]
d$dist <- sqrt(sum((d[1, ] - d[2, ])^2))
d
})
# Base R
result <- do.call(rbind,
by(dat[paste0(rep(c("x", "y"), 3), 1:3)],
list(dat$id3, dat$id2),
function(d){
d$dist <- sqrt(sum((d[1, ] - d[2, ])^2))
d
}
))
result[c("id3", "id2")] <- dat[c("id3", "id2")]
result
# x1 y2 x3 y1 x2 y3 dist id3 id2
# 1 2 1 4 5 8 1 6.480741 a1 2002
# 2 2 3 4 4 2 2 6.480741 a1 2002
# 5 1 1 4 4 1 1 3.464102 b1 2002
# 6 2 2 6 2 2 2 3.464102 b1 2002
# 3 2 3 1 3 7 3 4.242641 a1 2003
# 4 3 3 2 3 3 3 4.242641 a1 2003
# 7 4 4 3 1 2 4 5.916080 b1 2003
# 8 6 3 2 6 2 6 5.916080 b1 2003
# 7.1 4 4 3 1 2 4 9.000000 b2 2003
# 9 4 8 9 3 6 1 9.000000 b2 2003
# 8.1 6 3 2 6 2 6 11.313708 b3 2003
# 9.1 4 8 9 3 6 1 11.313708 b3 2003
Maybe this could be helpful.
dist(dat[which(dat[,"id1"]=="a" & dat[,"id2"]=="2002"),], method ="euclidean")
dist(dat[which(dat[,"id1"]=="b" & dat[,"id2"]=="2003"),], method ="euclidean")