data frame manipulation in R

I have a data frame that looks like this:
id = c("A","B","C","A","C","C")
val = c(5,4,6,7,10,99)
df = data.frame(id, val)
df
id val
A 5
B 4
C 6
A 7
C 10
C 99
Now I would like to re-arrange the id column (A, B, C, ...), keep the corresponding val values, and then add a new column newid consisting of the letter E followed by a three-digit counter that restarts for each id in the first column. The code is here:
id2 = c("A","A","B","C","C","C")
val2 = c(5,7,4,6,10,99)
newid = c("E001","E002","E001","E001","E002","E003")
df2 = data.frame(id2, val2, newid)
df2
and the final result is this:
id2 val2 newid
A 5 E001
A 7 E002
B 4 E001
C 6 E001
C 10 E002
C 99 E003
Is there an efficient way to do this?

library(data.table)
dt = data.table(df)
dt[, newid := paste0('E', gsub(' ', '0', format(1:.N, width = 3))), keyby = id]
dt
# id val newid
#1: A 5 E001
#2: A 7 E002
#3: B 4 E001
#4: C 6 E001
#5: C 10 E002
#6: C 99 E003
keyby here does the sorting, so there is no need to do it explicitly.
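As a side note, the zero-padding can also be done in a single step with sprintf(); a minimal sketch of the same keyby idea:
# "E%03d" left-pads the within-group counter to three digits and prefixes the E
dt[, newid := sprintf("E%03d", seq_len(.N)), keyby = id]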

Here is one way to do that, using the order() function to arrange the data, and the sprintf(), sapply() and table() functions to define newid.
df2 <- df[order(df$id, df$val), ]
df2$newid <- paste0("E", sprintf("%03d", unlist(sapply(table(df$id), function(x) 1:x))))
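The within-group counter can also be built with ave(); a sketch of the same idea, reusing the df2 created above:
# ave() applies seq_along() within each id group, giving a 1, 2, ... counter per id
df2$newid <- paste0("E", sprintf("%03d", ave(seq_along(df2$id), df2$id, FUN = seq_along)))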

Related

R summarize character counts

library(data.table)
DATA=data.table(STUDENT= c(1,2,3,4),
DOG_1= c("a","e","a","c"),
DOG_2= c("a","e","d","b"),
DOG_3= c("a","d","b","c"),
CAT_1= c("c","a","d","c"),
CAT_2= c("c","d","a","b"),
MOUSE_1= c("d","b","e","b"),
MOUSE_2= c("c","a","b","e"),
MOUSE_3= c("a","b","b","e"),
MOUSE_4= c("b","c","a","d"))
This is what my data looks like. I wish to end up with a new dataset that looks like this:
STUDENT DOG CAT MOUSE
1 3 6 10
2 14 5 8
3 7 5 10
4 8 5 16
where 'a' equals 1, 'b' equals 2, 'c' equals 3, 'd' equals 4 and 'e' equals 5. For example, STUDENT 1's DOG value of 3 is obtained by converting the DOG letters (a, a, a) to their values and summing 1 + 1 + 1.
For a data.table solution, melt 'DATA' into 'long' format by specifying the patterns from the column names. Then, using a named vector ('keyval') and grouping by 'STUDENT', loop over the columns specified in .SDcols, match and replace the letters with their integer values, and sum.
library(data.table)
nm1 <- unique(sub("_\\d+$", "", names(DATA)[-1]))
dt1 <- melt(DATA, id.var = 'STUDENT',
measure = patterns(nm1), value.name = nm1)
keyval <- setNames(1:5, letters[1:5])
dt1[, lapply(.SD, function(x) sum(keyval[x],
na.rm = TRUE)), by = STUDENT, .SDcols = nm1]
Output
# STUDENT DOG CAT MOUSE
#1: 1 3 6 10
#2: 2 14 5 8
#3: 3 7 5 10
#4: 4 8 5 16
A similar option in the tidyverse (reusing nm1 and keyval from above) would be
library(dplyr)
library(tidyr)
DATA %>%
pivot_longer(cols = -STUDENT, names_to = c('.value', 'grp'),
names_sep='_') %>%
group_by(STUDENT) %>%
summarise(across(all_of(nm1), ~ sum(keyval[.], na.rm = TRUE)))
# A tibble: 4 x 4
# STUDENT DOG CAT MOUSE
# <dbl> <int> <int> <int>
#1 1 3 6 10
#2 2 14 5 8
#3 3 7 5 10
#4 4 8 5 16
For the sake of completeness, here are two data.table approaches which use the new measure() function (available with data.table version 1.14.1) in the call to melt()
1. Melting, joining with a lookup table on-the-fly, casting
melt(DATA, measure.vars = measure(animal, rn, pattern = "(\\w+)_(\\d)"), value.name = "code")[
.(code = letters[1:5], value = 1:5), on = "code", value := i.value][
, dcast(.SD, STUDENT ~ animal, sum, value.var = "value")]
STUDENT CAT DOG MOUSE
1: 1 6 3 10
2: 2 5 14 8
3: 3 5 7 10
4: 4 5 8 16
2. Melting and summing factor levels
When the letters a to e are turned into factors, the corresponding factor levels get the numeric values 1 to 5.
library(magrittr) # piping used to improve readability
melt(DATA, measure.vars = measure(value.name, rn, pattern = "(\\w+)_(\\d)"))[, rn := NULL][
, lapply(.SD, \(x) factor(x, levels = letters[1:5]) %>% as.integer() %>% sum(na.rm = TRUE)),
by = STUDENT]
STUDENT DOG CAT MOUSE
1: 1 3 6 10
2: 2 14 5 8
3: 3 7 5 10
4: 4 8 5 16
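A quick illustration of the factor-to-integer mapping this relies on:
as.integer(factor(c("a", "c", "e"), levels = letters[1:5]))
# [1] 1 3 5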
Another data.table option using melt + dcast
dcast(
melt(DATA, id.var = "STUDENT")[
,
c("variable", "value") := .(gsub("_.*", "", variable),
value = setNames(1:5, c("a", "b", "c", "d", "e"))[value]
)
], STUDENT ~ variable, sum
)
gives
STUDENT CAT DOG MOUSE
1: 1 6 3 10
2: 2 5 14 8
3: 3 5 7 10
4: 4 5 8 16

Create new column by matching ID

I have the following example data.
data_1 <- data.frame("ID" = c('a','b','c','d','e'),
"value" = c(2,4,9,5,3))
data_2 <- data.frame("ID" = c('a','c','d','b','e','a','e','d','c'),
'var' =c(2,6,2,4,6,8,6,4,5))
I want to calculate a new column in data_2 such that, for matching IDs in the two datasets, value and var are multiplied.
Something like: where data_1$ID == data_2$ID, compute data_1$value * data_2$var. So newVar would be (4, 54, 10, 16, 18, 16, 18, 20, 45).
Join the two dataframes and multiply value and var.
transform(merge(data_1, data_2, by = 'ID'), result = value * var)
You can also use match:
transform(data_2, result = var * data_1$value[match(ID, data_1$ID)])
# ID var result
#1 a 2 4
#2 c 6 54
#3 d 2 10
#4 b 4 16
#5 e 6 18
#6 a 8 16
#7 e 6 18
#8 d 4 20
#9 c 5 45
Using dplyr:
library(dplyr)
inner_join(data_1, data_2, by = 'ID') %>% mutate(result = value * var)
Using data.table
library(data.table)
setDT(data_2)[data_1, result := value * var, on = .(ID)]
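Note that := adds the result column to data_2 by reference, so the table is modified in place and no explicit assignment is needed.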

R data.table: New data table with named columns and drop the rest

I want to do something very simple, but so far I have failed to do it in one command. I want to create a new data table by applying a function to some columns of an existing one, while giving them new names and dropping the rest.
Let's see a minimal example:
library(data.table)
dt = data.table(A = c('a', 'a', 'a', 'b', 'b'),
B = c(1 , 2 , 3 , 4 , 5 ),
C = c(10 , 20 , 30 , 40 , 50))
dt
A B C
a 1 10
a 2 20
a 3 30
b 4 40
b 5 50
For a single column, we can do:
dt1 = dt[, .(totalB = sum(B)), by=A]
dt1
A totalB
a 6
b 9
For more than 1 columns, we can do:
dt2 = dt[, .(totalB = sum(B), totalC = sum(C)), by=A]
dt2
A totalB totalC
a 6 60
b 9 90
But if there are many columns, that is not practical. So I guess we should go with lapply, like this:
dt3 = dt[, lapply(.SD, sum), by = A]
dt3
A B C
a 6 60
b 9 90
That creates the table but without the names. So we can add them:
names = c("totalA", "totalB")
dt4 = dt[, c("totalA", "totalB") := lapply(.SD, sum), by = A ]
dt4
A B C totalA totalB
a 1 10 6 60
a 2 20 6 60
a 3 30 6 60
b 4 40 9 90
b 5 50 9 90
But now the original columns remain. How can we prevent that? Also note that in my actual problem I use a subset of the columns, via .SDcols, which I didn't include here for simplicity.
EDIT: My desired output is the same as dt2 but I don't want to write down all columns.
Do you mean something like the following?
dt[, setNames(lapply(.SD, sum), paste0("total", names(.SD))), A]
Output
A totalB totalC
1: a 6 60
2: b 9 90
Another option is setnames. Create a vector ('nm1') of the column names, other than the grouping variable, that we want to apply the function to; then, grouped by 'A', get the sum and use setnames to replace the old names with the new ones.
nm1 <- setdiff(names(dt), "A")
setnames(dt[, lapply(.SD, sum), A], nm1, paste0('total', nm1))[]
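Since the question mentions restricting the aggregation to a subset of columns via .SDcols, here is a minimal sketch combining that with the renaming (the cols vector is a hypothetical subset):
cols <- c("B", "C")  # hypothetical subset of columns to sum
dt[, setNames(lapply(.SD, sum), paste0("total", cols)), by = A, .SDcols = cols]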

Replace values of multiple columns from one dataframe using another dataframe with conditions

Hi I have two data frames as followed:
df1:
ID x y z
1 a b c
2 a b c
3 a b c
4 a b c
and df2:
ID x y
2 d NA
3 NA e
and I am after a result like this:
df1:
ID x y z
1 a b c
2 d b c
3 a e c
4 a b c
I have been trying to use the match function as suggested by some other posts, but I keep running into the issue of df1 being overwritten with the NA values from df2.
This is the code I have been using without luck
for (i in names(df2)[2:length(names(df2))]) {
df1[i] <- df2[match(df1$ID, df2$ID)]
}
Thanks
Your code didn't work for me, so I changed it a little and now it works. If you are reading data from an external file, use stringsAsFactors = FALSE when you read it so you don't run into problems.
df1 = data.frame("ID" = 1:4,"x" = rep("a",4), "y" =rep("b",4),"z" = rep("c",4),
stringsAsFactors=FALSE)
df2 = data.frame("ID" = 2:3,"x" = c("d",NA), "y" = c(NA,"e"),stringsAsFactors=FALSE)
for(i in 1:nrow(df2)){
new_data = df2[i,-which(apply(df2[i,],2,is.na))]
pos = as.numeric(new_data[1])
col_replace = intersect(colnames(new_data),colnames(df1))
df1[pos,col_replace] = new_data
}
A solution using dplyr. The idea is to convert both data frames to long format, join them and replace the values, and then convert back to wide format. df5 is the final output.
library(dplyr)
library(tidyr)
df3 <- df1 %>% gather(Col, Value, -ID)
df4 <- df2 %>% gather(Col, Value, -ID, na.rm = TRUE)
df5 <- df3 %>%
left_join(df4, by = c("ID", "Col")) %>%
mutate(Value.x = ifelse(!is.na(Value.y), Value.y, Value.x)) %>%
select(ID, Col, Value.x) %>%
spread(Col, Value.x)
df5
# ID x y z
# 1 1 a b c
# 2 2 d b c
# 3 3 a e c
# 4 4 a b c
DATA
df1 <- read.table(text = "ID x y z
1 a b c
2 a b c
3 a b c
4 a b c",
header = TRUE, stringsAsFactors = FALSE)
df2 <- read.table(text = "ID x y
2 d NA
3 NA e",
header = TRUE, stringsAsFactors = FALSE)
As mentioned by alistaire this is an update join. It is available with the data.table package:
library(data.table)
setDT(df1)
setDT(df2)
df1[df2, on = "ID", x := ifelse(is.na(i.x), x, i.x)]
df1[df2, on = "ID", y := ifelse(is.na(i.y), y, i.y)]
df1
ID x y z
1: 1 a b c
2: 2 d b c
3: 3 a e c
4: 4 a b c
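A more compact variant of the same update join (a sketch, assuming a recent data.table where fcoalesce() is available):
# fcoalesce() picks the first non-NA value element-wise, so the replacement
# from df2 (i.x, i.y) is used where present and the existing value is kept otherwise
df1[df2, on = "ID", `:=`(x = fcoalesce(i.x, x), y = fcoalesce(i.y, y))]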
If there are many columns with replacement values, it might be worthwhile to follow www's suggestion to do the replacement after reshaping to long format where column names are treated as data:
library(data.table)
melt(setDT(df1), "ID")[
melt(setDT(df2), "ID", na.rm = TRUE), on = .(ID, variable), value := i.value][
, dcast(.SD, ID ~ variable)]
ID x y z
1: 1 a b c
2: 2 d b c
3: 3 a e c
4: 4 a b c
Data
df1 <- fread(
"ID x y z
1 a b c
2 a b c
3 a b c
4 a b c")
df2 <- fread(
"ID x y
2 d NA
3 NA e")

Get number of same individuals for different groups

I have a data set with individuals (ID) that can be part of more than one group.
Example:
library(data.table)
DT <- data.table(
ID = rep(1:5, c(3:1, 2:3)),
Group = c("A", "B", "C", "B",
"C", "A", "A", "C",
"A", "B", "C")
)
DT
# ID Group
# 1: 1 A
# 2: 1 B
# 3: 1 C
# 4: 2 B
# 5: 2 C
# 6: 3 A
# 7: 4 A
# 8: 4 C
# 9: 5 A
# 10: 5 B
# 11: 5 C
I want to know, for each combination of 2 groups, how many individuals they have in common.
The result should look like this:
Group.1 Group.2 Sum
A B 2
A C 3
B C 3
Where Sum indicates the number of individuals the two groups have in common.
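For example, group A contains IDs 1, 3, 4 and 5 while group B contains IDs 1, 2 and 5; they share IDs 1 and 5, hence Sum = 2 for the pair A, B.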
Here's my version:
# size-1 IDs can't contribute; skip
DT[ , if (.N > 1)
# simplify = FALSE returns a list;
# transpose turns the 3-length list of 2-length vectors
# into a length-2 list of 3-length vectors (efficiently)
transpose(combn(Group, 2L, simplify = FALSE)), by = ID
][ , .(Sum = .N), keyby = .(Group.1 = V1, Group.2 = V2)]
With output:
# Group.1 Group.2 Sum
# 1: A B 2
# 2: A C 3
# 3: B C 3
As of version 1.9.8 (on CRAN 25 Nov 2016), data.table has gained the ability to do non-equi joins. So, a self non-equi join can be used:
library(data.table) # v1.9.8+
setDT(DT)[, Group:= factor(Group)]
DT[DT, on = .(ID, Group < Group), nomatch = 0L, .(ID, x.Group, i.Group)][
, .N, by = .(x.Group, i.Group)]
x.Group i.Group N
1: A B 2
2: A C 3
3: B C 3
Explanation
The non-equi join on ID, Group < Group is a data.table version of combn() (but applied group-wise):
DT[DT, on = .(ID, Group < Group), nomatch = 0L, .(ID, x.Group, i.Group)]
ID x.Group i.Group
1: 1 A B
2: 1 A C
3: 1 B C
4: 2 B C
5: 4 A C
6: 5 A B
7: 5 A C
8: 5 B C
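For comparison, combn() applied to ID 1's groups produces the same three pairs:
combn(c("A", "B", "C"), 2)
#      [,1] [,2] [,3]
# [1,] "A"  "A"  "B"
# [2,] "B"  "C"  "C"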
We self-join the dataset with itself on 'ID', subset the rows where the 'Group' columns differ, count the rows (.N) grouped by the two 'Group' columns, then order 'Group.1' and 'Group.2' within each row using pmin/pmax and take the unique value of 'N'.
library(data.table)#v1.9.6+
DT[DT, on='ID', allow.cartesian=TRUE][Group!=i.Group, .N ,.(Group, i.Group)][,
list(Sum=unique(N)) ,.(Group.1=pmin(Group, i.Group), Group.2=pmax(Group, i.Group))]
# Group.1 Group.2 Sum
#1: A B 2
#2: A C 3
#3: B C 3
Or, as mentioned in the comments by @MichaelChirico and @Frank, we can convert 'Group' to factor class, subset the rows based on as.integer(Group) < as.integer(i.Group), group by 'Group' and 'i.Group', and get the row count (.N)
DT[, Group:= factor(Group)]
DT[DT, on='ID', allow.cartesian=TRUE][as.integer(Group) < as.integer(i.Group), .N,
by = .(Group.1= Group, Group.2= i.Group)]
Great answers above.
Just an alternative using dplyr in case you, or someone else, is interested.
library(dplyr)
cmb = combn(unique(DT$Group), 2)
data.frame(g1 = cmb[1,],
g2 = cmb[2,]) %>%
group_by(g1,g2) %>%
summarise(l=length(intersect(DT[DT$Group==g1,]$ID,
DT[DT$Group==g2,]$ID)))
# g1 g2 l
# (fctr) (fctr) (int)
# 1 A B 2
# 2 A C 3
# 3 B C 3
Yet another solution (base R):
tmp <- split(DT, DT$Group)
ans <- apply(combn(LETTERS[1 : 3], 2), 2, FUN = function(ind){
out <- length(intersect(tmp[[ind[1]]]$ID, tmp[[ind[2]]]$ID))
c(group1 = ind[1], group2 = ind[2], sum_ = out)
}
)
data.frame(t(ans))
# group1 group2 sum_
#1 A B 2
#2 A C 3
#3 B C 3
First split the data into a list of groups; then, for each unique pairwise combination of two groups, count how many subjects they have in common, using length(intersect(...)).
