Get indices from each row and merge with original data.frame - r

I have the following data.frame
user_id 1 2 3 4 5 6 7 8 9
1 54449024717783 0 0 1 0 0 0 0 0 0
2 117592134783793 0 0 0 0 0 1 0 0 0
3 187145545782493 0 0 1 0 0 0 0 0 0
4 245003020993334 0 0 0 0 0 1 0 0 0
5 332625230637592 0 1 0 0 0 0 0 0 0
6 336336752713947 0 1 0 0 0 0 0 0 0
What I would like to do is create one column (and remove columns 1:9) that holds the name of the column where the value is 1. Each user has exactly one column with the value 1.
If im running the following function:
rowSums(users_cluster(users_cluster), dims = 1)
it will summarize all the rows value but I need to duplicate it with the column name

Base R solution:
# which() on the transposed matrix walks the data row by row, so each linear
# index equals (row - 1) * n + column, with n the number of value columns.
# Shifting by one around the modulo keeps a hit in the last value column from
# wrapping around to 0.
data.frame(user_id = df[, 1],
           name = (which(t(df[, -1] == 1)) - 1) %% (ncol(df) - 1) + 1)
# user_id name
# 1 54449024717783 3
# 2 117592134783793 6
# 3 187145545782493 3
# 4 245003020993334 6
# 5 332625230637592 2
# 6 336336752713947 2

Here's another base R option:
# Locate every non-zero cell as a two-column (row index, column index) matrix.
inds <- which(df[, -1] != 0, arr.ind = TRUE)
# Sort on the numeric row index. Sorting on row.names(inds) compares strings
# ("10" sorts before "2") and misaligns data with ten or more rows.
df$newcol <- inds[order(inds[, 1]), 2]
# Select by name so the result does not depend on newcol being column 11.
df[, c("user_id", "newcol")]
# user_id newcol
#1 5.444902e+13 3
#2 1.175921e+14 6
#3 1.871455e+14 3
#4 2.450030e+14 6
#5 3.326252e+14 2
#6 3.363368e+14 2

Another approach is max.col from base R, since the user specified that each user has exactly one column with the value 1.
cbind(dat[1], ind = max.col(dat[-1], 'first'))
# user_id ind
#1 54449024717783 3
#2 117592134783793 6
#3 187145545782493 3
#4 245003020993334 6
#5 332625230637592 2
#6 336336752713947 2

Another base R solution:
# For each row, which() returns the position of the value column that is > 0.
df$ind <- apply(X = df[, -1] > 0, MARGIN = 1, FUN = which)
df[, c("user_id", "ind")]
Output:
user_id ind
1 5.444902e+13 3
2 1.175921e+14 6
3 1.871455e+14 3
4 2.450030e+14 6
5 3.326252e+14 2
6 3.363368e+14 2

A solution using the tidyverse.
library(tidyverse)
# Reshape to one row per (user, column) pair. pivot_longer() emits the pairs
# row by row, so the original user order is kept without a helper ID column.
dat2 <- dat %>%
  pivot_longer(-user_id, names_to = "Column", values_to = "Value") %>%
  # keep only the pair that holds the 1 for each user
  filter(Value == 1) %>%
  select(-Value) %>%
  as.data.frame()
dat2
# user_id Column
# 1 54449024717783 3
# 2 117592134783793 6
# 3 187145545782493 3
# 4 245003020993334 6
# 5 332625230637592 2
# 6 336336752713947 2
DATA
dat <- read.table(text = " user_id 1 2 3 4 5 6 7 8 9
1 54449024717783 0 0 1 0 0 0 0 0 0
2 117592134783793 0 0 0 0 0 1 0 0 0
3 187145545782493 0 0 1 0 0 0 0 0 0
4 245003020993334 0 0 0 0 0 1 0 0 0
5 332625230637592 0 1 0 0 0 0 0 0 0
6 336336752713947 0 1 0 0 0 0 0 0 0",
header = TRUE, stringsAsFactors = FALSE)
library(tidyverse)
# read.table() prefixes the numeric column names with "X"; strip that prefix
# (anchored, so only a leading X is removed) and store user_id as text.
# as_tibble() replaces the deprecated as.tibble().
dat <- as_tibble(dat) %>%
  setNames(sub("^X", "", names(.))) %>%
  mutate(user_id = as.character(user_id))

For the sake of completeness, here is also a data.table solution which uses melt() to reshape from wide to long format:
library(data.table)
melt(setDT(DF), id = "user_id")[value == 1L][order(user_id), !"value"]
user_id variable
1: 54449024717783 3
2: 117592134783793 6
3: 187145545782493 3
4: 245003020993334 6
5: 332625230637592 2
6: 336336752713947 2
This takes advantage of the fact that the sample dataset is already sorted by ascending user_id.
In case the sample dataset has a different order which should be maintained in the final result, it is necessary to remember that order by introducing a temporary row id:
melt(setDT(DF), id = "user_id")[, rn := rowid(variable)][value == 1L][
order(rn), !c("rn", "value")]
or, alternatively,
melt(setDT(DF), id = "user_id")[, rn := rowid(variable)][, setorder(.SD, rn)][
value == 1L, !c("rn", "value")]
Data
library(data.table)
DF <- fread(
"i user_id 1 2 3 4 5 6 7 8 9
1 54449024717783 0 0 1 0 0 0 0 0 0
2 117592134783793 0 0 0 0 0 1 0 0 0
3 187145545782493 0 0 1 0 0 0 0 0 0
4 245003020993334 0 0 0 0 0 1 0 0 0
5 332625230637592 0 1 0 0 0 0 0 0 0
6 336336752713947 0 1 0 0 0 0 0 0 0"
, drop = 1L)[, lapply(.SD, as.integer), by = user_id]

Related

adding together multiple sets of columns in r

I'm trying to add several sets of columns together.
Example df:
df <- data.frame(
key = 1:5,
ab0 = c(1,0,0,0,1),
ab1 = c(0,2,1,0,0),
ab5 = c(1,0,0,0,1),
bc0 = c(0,1,0,2,0),
bc1 = c(2,0,0,0,0),
bc5 = c(0,2,1,0,1),
df0 = c(0,0,0,1,0),
df1 = c(1,0,3,0,0),
df5 = c(1,0,0,0,6)
)
Giving me:
key ab0 ab1 ab5 bc0 bc1 bc5 df0 df1 df5
1 1 1 0 1 0 2 0 0 1 1
2 2 0 2 0 1 0 2 0 0 0
3 3 0 1 0 0 0 1 0 3 0
4 4 0 0 0 2 0 0 1 0 0
5 5 1 0 1 0 0 1 0 0 6
I want to add all sets of columns with 0s and 5s in them together and place them in the 0 column.
So the end result would be:
key ab0 ab1 ab5 bc0 bc1 bc5 df0 df1 df5
1 1 2 0 1 0 2 0 0 1 1
2 2 0 2 0 3 0 2 0 0 0
3 3 0 1 0 1 0 1 0 3 0
4 4 0 0 0 2 0 0 2 0 0
5 5 2 0 1 1 0 1 0 0 6
I could add the columns together using 3 lines:
df$ab0 <- df$ab0 + df$ab5
df$bc0 <- df$bc0 + df$bc5
df$df0 <- df$df0 + df$df5
But my real example has over a hundred columns so I'd like to iterate over them and use apply.
The column names of the first set are contained in col0 and the names of the second set are in col5.
col0 <- c("ab0","bc0","df0")
col5 <- c("ab5","bc5","df5")
I created a function to add the columns together using mapply:
fun1 <- function(df,x,y) {
df[,x] <- df[,x] + df[,y]
}
mapply(fun1,df,col0,col5)
But I get an error: Error in df[, x] : incorrect number of dimensions
Thoughts?
Simply add two data frames together by their subsetted columns, assuming they will be the same length. No loops needed. All vectorized operation.
final_df <- df[grep("0", names(df))] + df[grep("5", names(df))]
final_df <- cbind(final_df, df[grep("0", names(df), invert=TRUE)])
final_df <- final_df[order(names(final_df))]
final_df
# ab0 ab1 ab5 bc0 bc1 bc5 df0 df1 df5 key
# 1 2 0 1 0 2 0 1 1 1 1
# 2 0 2 0 3 0 2 0 0 0 2
# 3 0 1 0 1 0 1 0 3 0 3
# 4 0 0 0 2 0 0 1 0 0 4
# 5 2 0 1 1 0 1 6 0 6 5
Rextester demo
You could use map2 from the purrr package to iterate over the two vectors at once:
df <- data.frame(
key = 1:5,
ab0 = c(1,0,0,0,1),
ab1 = c(0,2,1,0,0),
ab5 = c(1,0,0,0,1),
bc0 = c(0,1,0,2,0),
bc1 = c(2,0,0,0,0),
bc5 = c(0,2,1,0,1),
df0 = c(0,0,0,1,0),
df1 = c(1,0,3,0,0),
df5 = c(1,0,0,0,6)
)
col0 <- c("ab0","bc0","df0")
col5 <- c("ab5","bc5","df5")
# Add each "5" column onto its matching "0" column. map2() walks the paired
# columns and returns the sums, which are assigned back in a single step
# instead of mutating df from inside the callback with `<<-`.
df[col0] <- purrr::map2(df[col0], df[col5], function(x, y) x + y)
> df
key ab0 ab1 ab5 bc0 bc1 bc5 df0 df1 df5
1 1 2 0 1 0 2 0 1 1 1
2 2 0 2 0 3 0 2 0 0 0
3 3 0 1 0 1 0 1 0 3 0
4 4 0 0 0 2 0 0 1 0 0
5 5 2 0 1 1 0 1 6 0 6
Here's an approach using tidyr and dplyr from the tidyverse meta-package.
First, I bring the table into long ("tidy") format, and split out the column into two components, and spread by the number part of those components.
Then I do the calculation you describe.
Finally, I bring it back into the original format using the inverse of step 1.
library(tidyverse)
df_tidy <- df %>%
# Step 1
gather(col, value, -key) %>%
separate(col, into = c("grp", "num"), 2) %>%
spread(num, value) %>%
# Step 2
mutate(`0` = `0` + `5`) %>%
# Step 3, which is just the inverse of Step 1.
gather(num, value, -key, - grp) %>%
unite(col, c("grp", "num")) %>%
spread(col, value)
df_tidy
key ab_0 ab_1 ab_5 bc_0 bc_1 bc_5 df_0 df_1 df_5
1 1 2 0 1 0 2 0 1 1 1
2 2 0 2 0 3 0 2 0 0 0
3 3 0 1 0 1 0 1 0 3 0
4 4 0 0 0 2 0 0 1 0 0
5 5 2 0 1 1 0 1 6 0 6

Split column of comma-separated numbers into multiple columns based on value

I have a column f in my dataframe that I would like to spread into multiple columns based on the values in that column. For example:
df <- structure(list(f = c(NA, "18,17,10", "12,8", "17,11,6", "18",
"12", "12", NA, "17,11", "12")), .Names = "f", row.names = c(NA,
10L), class = "data.frame")
df
# f
# 1 <NA>
# 2 18,17,10
# 3 12,8
# 4 17,11,6
# 5 18
# 6 12
# 7 12
# 8 <NA>
# 9 17,11
# 10 12
How would I split column f into multiple columns indicating the numbers in the row. I'm interested in something like this:
6 8 10 11 12 17 18
1 0 0 0 0 0 0 0
2 0 0 1 0 0 1 1
3 0 1 0 0 1 0 0
4 1 0 0 1 0 1 0
5 0 0 0 0 0 0 1
6 0 0 0 0 1 0 0
7 0 0 0 0 1 0 0
8 0 0 0 0 0 0 0
9 0 0 0 1 0 1 0
10 0 0 0 0 1 0 0
I'm thinking I could use unique on the f column to create the separate columns based on the different numbers and then do a grepl to determine if the specific number is in column f, but I was wondering if there was a better way. Something similar to spread or separate in the tidyr package.
A solution using tidyr::separate_rows will be as:
library(tidyverse)
df %>%
  mutate(ind = row_number()) %>%
  # convert = TRUE turns the split values into integers, so the resulting
  # columns sort numerically (6, 8, 10, ...) rather than as text (10, 11, ..., 6, 8)
  separate_rows(f, sep = ",", convert = TRUE) %>%
  # rows with no values get a placeholder 0 so they survive count()
  mutate(f = replace_na(f, 0L)) %>%
  count(ind, f) %>%
  spread(f, n, fill = 0) %>%
  # drop the placeholder column by name; dropping by position would remove a
  # real column whenever the data has no NA rows
  select(-any_of("0")) %>%
  as.data.frame()
#    ind 6 8 10 11 12 17 18
# 1    1 0 0  0  0  0  0  0
# 2    2 0 0  1  0  0  1  1
# 3    3 0 1  0  0  1  0  0
# 4    4 1 0  0  1  0  1  0
# 5    5 0 0  0  0  0  0  1
# 6    6 0 0  0  0  1  0  0
# 7    7 0 0  0  0  1  0  0
# 8    8 0 0  0  0  0  0  0
# 9    9 0 0  0  1  0  1  0
# 10  10 0 0  0  0  1  0  0
This could be achieved by splitting on the ",", then stacking the result into a two-column data.frame and getting the frequencies with table
df1 <- na.omit(stack(setNames(lapply(strsplit(df$f, ","),
as.numeric), seq_len(nrow(df))))[, 2:1])
table(df1)
# values
#ind 6 8 10 11 12 17 18
# 1 0 0 0 0 0 0 0
# 2 0 0 1 0 0 1 1
# 3 0 1 0 0 1 0 0
# 4 1 0 0 1 0 1 0
# 5 0 0 0 0 0 0 1
# 6 0 0 0 0 1 0 0
# 7 0 0 0 0 1 0 0
# 8 0 0 0 0 0 0 0
# 9 0 0 0 1 0 1 0
# 10 0 0 0 0 1 0 0

r - create a sequence only for certain values

Does anyone has an idea how to achieve the following: start from here
df <- data.frame(var = c(0,0,1,1,0,0,0,1,1,0,0,0,0,1,1))
and achieve this:
df <- data.frame(var = c(0,0,1,1,0,0,0,1,1,0,0,0,0,1,1),
newvar = c(0,0,1,1,0,0,0,2,2,0,0,0,0,3,3))
Here is an option with rle: replace the 'values' that are not 0 with a running sequence over those values, and then call inverse.rle to rebuild the full vector
df$newvar <- inverse.rle(within.list(rle(df$var),
values[values!=0] <- seq_along(values[values!=0])))
df
# var newvar
#1 0 0
#2 0 0
#3 1 1
#4 1 1
#5 0 0
#6 0 0
#7 0 0
#8 1 2
#9 1 2
#10 0 0
#11 0 0
#12 0 0
#13 0 0
#14 1 3
#15 1 3
you can try following:
# n  : 1 where var differs from the previous row (a run boundary), else 0
# n2 : n restricted to rows where var is non-zero, i.e. the start of a run of 1s
# res: running count of run starts, shown only on the rows that belong to a run
# Comparing against the previous row (rather than the next one) counts every
# run exactly once, whatever its length.
df %>%
  mutate(n = ifelse(var != lag(var, default = 0), 1, 0)) %>%
  mutate(n2 = ifelse(var == 0, 0, n)) %>%
  mutate(res = ifelse(var == 1, cumsum(n2), 0))
   var n n2 res
1    0 0  0   0
2    0 0  0   0
3    1 1  1   1
4    1 0  0   1
5    0 1  0   0
6    0 0  0   0
7    0 0  0   0
8    1 1  1   2
9    1 0  0   2
10   0 1  0   0
11   0 0  0   0
12   0 0  0   0
13   0 0  0   0
14   1 1  1   3
15   1 0  0   3
Then select(var, res) only the columns you need.
An efficient solution:
# temp is +1 on the first row of every run of 1s. default = 0 treats the row
# before the data as a 0, so a series that opens with a run of 1s still has
# that first run counted.
df %>%
  mutate(temp = var - lag(var, default = 0)) %>%
  mutate(newvar = var * cumsum(temp > 0))
or without the additional column:
# Same idea, reusing newvar for the intermediate difference. default = 0
# treats the row before the data as a 0, so a leading run of 1s is counted.
df %>%
  mutate(newvar = var - lag(var, default = 0)) %>%
  mutate(newvar = var * cumsum(newvar > 0))
var temp newvar
1 0 0 0
2 0 0 0
3 1 1 1
4 1 0 1
5 0 -1 0
6 0 0 0
7 0 0 0
8 1 1 2
9 1 0 2
10 0 -1 0
11 0 0 0
12 0 0 0
13 0 0 0
14 1 1 3
15 1 0 3
Another one for fun:
library(data.table)
setDT(df)
tmp = 0
df[, newvar := if(var[1] != 0) tmp <- tmp + 1 else 0, by = rleid(var)][]
And another one:
df[, newvar := var * cumsum(diff(c(0, var)) == 1)]
# or if still a data.frame
within(df, newvar <- var * cumsum(diff(c(0, var)) == 1))

adding data frame of counts to template data frame in R

I have data.frames of counts such as:
a <- data.frame(id=1:10,
"1"=c(rep(1,3),rep(0,7)),
"3"=c(rep(0,4),rep(1,6)))
names(a)[2:3] <- c("1","3")
a
> a
id 1 3
1 1 1 0
2 2 1 0
3 3 1 0
4 4 0 0
5 5 0 1
6 6 0 1
7 7 0 1
8 8 0 1
9 9 0 1
10 10 0 1
and a template data.frame such as
m <- data.frame(id=1:10,
"1"= rep(0,10),
"2"= rep(0,10),
"3"= rep(0,10),
"4"= rep(0,10))
names(m)[-1] <- 1:4
m
> m
id 1 2 3 4
1 1 0 0 0 0
2 2 0 0 0 0
3 3 0 0 0 0
4 4 0 0 0 0
5 5 0 0 0 0
6 6 0 0 0 0
7 7 0 0 0 0
8 8 0 0 0 0
9 9 0 0 0 0
10 10 0 0 0 0
and I want to add the values of a into the template m
in the appropriate columns, leaving the rest as 0.
This is working but I would like to know
if there is a more elegant way, perhaps using plyr or data.table:
provi <- rbind.fill(a,m)
provi[is.na(provi)] <- 0
mnew <- aggregate(provi[,-1],by=list(provi$id),FUN=sum)
names(mnew)[1] <- "id"
mnew <- mnew[c(1,order(names(mnew)[-1])+1)]
mnew
> mnew
id 1 2 3 4
1 1 1 0 0 0
2 2 1 0 0 0
3 3 1 0 0 0
4 4 0 0 0 0
5 5 0 0 1 0
6 6 0 0 1 0
7 7 0 0 1 0
8 8 0 0 1 0
9 9 0 0 1 0
10 10 0 0 1 0
I guess the concise option would be:
m[names(a)] <- a
Or we match the column names ('i1'), use that to create the column index with max.col, cbind with the row index ('i2'), and a similar step can be done to create 'i3'. We change the values in 'm' corresponding to 'i2' with the 'a' values based on 'i3'.
i1 <- match(names(a)[-1], names(m)[-1])
i2 <- cbind(m$id, i1[max.col(a[-1], 'first')]+1L)
i3 <- cbind(a$id, max.col(a[-1], 'first')+1L)
m[i2] <- a[i3]
m
# id 1 2 3 4
#1 1 1 0 0 0
#2 2 1 0 0 0
#3 3 1 0 0 0
#4 4 0 0 0 0
#5 5 0 0 1 0
#6 6 0 0 1 0
#7 7 0 0 1 0
#8 8 0 0 1 0
#9 9 0 0 1 0
#10 10 0 0 1 0
A data.table option would be melt/dcast
library(data.table)
dcast(melt(setDT(a), id.var='id')[,
variable:= factor(variable, levels=1:4)],
id~variable, value.var='value', drop=FALSE, fill=0)
# id 1 2 3 4
# 1: 1 1 0 0 0
# 2: 2 1 0 0 0
# 3: 3 1 0 0 0
# 4: 4 0 0 0 0
# 5: 5 0 0 1 0
# 6: 6 0 0 1 0
# 7: 7 0 0 1 0
# 8: 8 0 0 1 0
# 9: 9 0 0 1 0
#10: 10 0 0 1 0
A similar dplyr/tidyr option would be
library(dplyr)
library(tidyr)
gather(a, Var, Val, -id) %>%
mutate(Var=factor(Var, levels=1:4)) %>%
spread(Var, Val, drop=FALSE, fill=0)
You could use merge, too:
res <- suppressWarnings(merge(a, m, by="id", suffixes = c("", "")))
(res[, which(!duplicated(names(res)))][, names(m)])
# id 1 2 3 4
# 1 1 1 0 0 0
# 2 2 1 0 0 0
# 3 3 1 0 0 0
# 4 4 0 0 0 0
# 5 5 0 0 1 0
# 6 6 0 0 1 0
# 7 7 0 0 1 0
# 8 8 0 0 1 0
# 9 9 0 0 1 0
# 10 10 0 0 1 0

Faster way to multiplication in data frame

I have a data frame (name t) like this
ID N com_a com_b com_c
A 3 1 0 0
A 5 0 1 0
B 1 1 0 0
B 1 0 1 0
B 4 0 0 1
B 4 1 0 0
I have try to do com_a*N com_b*N com_c*N
ID N com_a com_b com_c com_a_N com_b_N com_c_N
A 3 1 0 0 3 0 0
A 5 0 1 0 0 5 0
B 1 1 0 0 1 0 0
B 1 0 1 0 0 1 0
B 4 0 0 1 0 0 4
B 4 1 0 0 4 0 0
I use a for loop, but it takes a long time. How can I do this quickly on big data?
for (i in 1:dim(t)[1]){
t$com_a_N[i]=t$com_a[i]*t$N[i]
t$com_b_N[i]=t$com_b[i]*t$N[i]
t$com_c_N[i]=t$com_c[i]*t$N[i]
}
t <- transform(t,
com_a_N=com_a*N,
com_b_N=com_b*N,
com_c_N=com_c*N)
should be much faster. data.table solutions might be faster still.
You can use sweep for this
(st <- sweep(t[, 3:5], 1, t$N, "*"))
# com_a com_b com_c
#1 3 0 0
#2 0 5 0
#3 1 0 0
#4 0 1 0
#5 0 0 4
#6 4 0 0
The new names can be created with paste and setNames, and you can add the new columns to the existing data.frame with cbind. This will scale for any number of columns.
cbind(t, setNames(st, paste(names(st), "N", sep="_")))
# ID N com_a com_b com_c com_a_N com_b_N com_c_N
#1 A 3 1 0 0 3 0 0
#2 A 5 0 1 0 0 5 0
#3 B 1 1 0 0 1 0 0
#4 B 1 0 1 0 0 1 0
#5 B 4 0 0 1 0 0 4
#6 B 4 1 0 0 4 0 0
A data.table solution as proposed by @BenBolker
library(data.table)
setDT(t)[, c("com_a_N", "com_b_N", "com_c_N") := list(com_a*N, com_b*N, com_c*N)]
## ID N com_a com_b com_c com_a_N com_b_N com_c_N
## 1: A 3 1 0 0 3 0 0
## 2: A 5 0 1 0 0 5 0
## 3: B 1 1 0 0 1 0 0
## 4: B 1 0 1 0 0 1 0
## 5: B 4 0 0 1 0 0 4
## 6: B 4 1 0 0 4 0 0
Even faster using vectorized element-wise multiplication (N is recycled down each column; dat here is the question's data frame t):
cbind(dat,dat[,3:5]*dat$N)
Though you should set colnames after....
To avoid using explicit column index(not recommended) , you can use some grep magic:
cbind(dat,dat[,grep('com',colnames(dat))]*dat$N)
Another option with dplyr:
# library() stops with an error if dplyr is missing; require() would only
# return FALSE and let the mutate() call fail later with a less clear message.
library(dplyr)
# Multiply each indicator column by N in one vectorized step.
t <- mutate(t,
            com_a_N = com_a * N,
            com_b_N = com_b * N,
            com_c_N = com_c * N)

Resources