I have a data frame in which the same character value is repeated across several rows:
a 1
a 3
a 7
b 4
b 8
I want to change it to:
a.1 1
a.2 3
a.3 7
b.1 4
b.2 8
Do you know any code in R for this?
Thanks a lot.
You can also use data.table package:
library(data.table)
# Convert df to a data.table in place, then add column `ix` by reference:
# each V1 label followed by its running position within that V1 group.
# The trailing [] makes the updated table print.
setDT(df)[, ix := paste(V1, seq_len(.N), sep = "."), by = V1][]
# V1 V2 ix
#1: a 1 a.1
#2: a 3 a.2
#3: a 7 a.3
#4: b 4 b.1
#5: b 8 b.2
Data:
# Example data: V1 is a factor with levels "a" and "b", V2 an integer column.
df <- data.frame(
  V1 = factor(c("a", "a", "a", "b", "b"), levels = c("a", "b")),
  V2 = c(1L, 3L, 7L, 4L, 8L)
)
In base R, you could do:
# Append a within-group running counter to V1 (a.1, a.2, ..., b.1, ...).
# ave() is fed seq_along(V1) rather than as.numeric(V1): that vector only
# provides the slots FUN fills in, and as.numeric() on a character column
# would produce NAs plus a coercion warning.
df$V1 <- with(
  df,
  paste(V1, ave(seq_along(V1), V1, FUN = seq_along), sep = ".")
)
print(df)
# V1 V2
#1 a.1 1
#2 a.2 3
#3 a.3 7
#4 b.1 4
#5 b.2 8
We can use dplyr/tidyr. We group by 'V1', create a sequence column ('VN'), unite the columns 'V1' and 'VN', and then rename the column.
library(dplyr)
library(tidyr)
# Per V1 group, add a row counter VN; unite() then glues V1 and VN together
# with '.' into a new column V1n, which is finally renamed back to V1.
df %>%
group_by(V1) %>%
mutate(VN = row_number()) %>%
unite(V1n, V1, VN, sep='.') %>%
rename(V1=V1n)
# V1 V2
# (chr) (int)
#1 a.1 1
#2 a.2 3
#3 a.3 7
#4 b.1 4
#5 b.2 8
Related
Suppose I have a data frame (df) like this:
Names ID Thing1 Thing2 Thing3 Thing4 Thing5
1: Gen1 id1 10 5 10 5 10
2: Gen2 id2 1 2 3 4 5
3: Gen1 id3 10 5 10 5 10
4: Gen2 id4 1 2 3 4 5
5: Gen3 id5 7 7 7 7 7
For each 'Names', I would like to sum 'Thing' columns, and collapse the strings in 'ID':
Names ID Thing1 Thing2 Thing3 Thing4 Thing5
1: Gen1 id1|id3 20 10 20 10 20
2: Gen2 id2|id4 2 4 6 8 10
3: Gen3 id5 7 7 7 7 7
I am able to achieve this via dplyr:
# First pass: per Names, collapse the distinct values of every column whose
# name contains no digit (matches('^\\D+$')) into one '|'-separated string.
df1 <- df %>%
group_by(Names)%>%
summarise_each(funs(paste(unique(.), collapse='|')),matches('^\\D+$'))
# Second pass: per Names, sum every column whose name starts with 'Thing',
# ignoring NAs.
df2 <- df %>%
group_by(Names)%>%
summarise_each(funs(sum = sum(., na.rm=TRUE)), starts_with('Thing' ))
# Put both results side by side; df2[-1] drops the first column of df2
# (presumably the repeated Names column) so it is not duplicated.
bind_cols(df1, df2[-1])
However, this solution takes a very long time, since my data frame has more than 10k rows and more than 10k columns!
Is there any possible solution with data.table?
The closest I have gotten is this here:
> setDT(df)[, c(paste(df$ID,collapse = "-", sep = ""), lapply(.SD, sum, na.rm = TRUE)),
by = Names, .SDcols = !"ID"]
Names Thing1 Thing2 Thing3 Thing4 Thing5
1: Gen1 id1-id2-id3-id4-id5 20 10 20 10 20
2: Gen2 id1-id2-id3-id4-id5 2 4 6 8 10
3: Gen3 id1-id2-id3-id4-id5 7 7 7 7 7
Obviously this is not what I am going for since it will collapse all IDs and not just the ones that were aggregated by summarizing via "Names".
I would very much appreciate your help!
Here is the example data:
# Example data: two character columns (Names, ID) and five integer columns.
df <- data.frame(
  Names = c("Gen1", "Gen2", "Gen1", "Gen2", "Gen3"),
  ID = c("id1", "id2", "id3", "id4", "id5"),
  Thing1 = c(10L, 1L, 10L, 1L, 7L),
  Thing2 = c(5L, 2L, 5L, 2L, 7L),
  Thing3 = c(10L, 3L, 10L, 3L, 7L),
  Thing4 = c(5L, 4L, 5L, 4L, 7L),
  Thing5 = c(10L, 5L, 10L, 5L, 7L),
  stringsAsFactors = FALSE
)
If you don't heavily rely on data.table you could use aggregate two times and merge the results.
# Sum every remaining column per Names (the second column, ID, is dropped
# first), collapse the IDs per Names separately, then join the two results
# on their shared Names column.
thing_totals <- aggregate(. ~ Names, data = df[-2], FUN = sum)
id_labels <- aggregate(ID ~ Names, data = df, FUN = paste, collapse = "|")
merge(thing_totals, id_labels)
# Names Thing1 Thing2 Thing3 Thing4 Thing5 ID
# 1 Gen1 20 10 20 10 20 id1|id3
# 2 Gen2 2 4 6 8 10 id2|id4
# 3 Gen3 7 7 7 7 7 id5
Try it this way.
Using the tidyverse:
library(tidyverse)
# Per Names: collapse every character column into one "|"-separated string
# and sum every numeric column, ignoring NAs. The extra arguments are bound
# inside explicit functions instead of being passed through across(...),
# and TRUE is spelled out because T can be reassigned.
df %>%
  group_by(Names) %>%
  summarise(
    across(where(is.character), function(x) str_c(x, collapse = "|")),
    across(where(is.numeric), function(x) sum(x, na.rm = TRUE))
  )
# A tibble: 3 x 7
Names ID Thing1 Thing2 Thing3 Thing4 Thing5
<chr> <chr> <int> <int> <int> <int> <int>
1 Gen1 id1|id3 20 10 20 10 20
2 Gen2 id2|id4 2 4 6 8 10
3 Gen3 id5 7 7 7 7 7
use data.table
library(data.table)
# Work on a copy so the caller's df is not converted by reference.
dt <- copy(df)
setDT(dt)
# Per Names: sum every column except ID, ignoring NAs as the question asks.
out_sum <- dt[, lapply(.SD, sum, na.rm = TRUE), by = Names, .SDcols = !"ID"]
# Per Names: collapse the group's IDs into one "|"-separated string.
out_id <- dt[, list(id = paste(ID, collapse = "|")), by = Names]
# Join the two per-group results on the grouping column.
merge(out_id, out_sum, by = "Names")
Names id Thing1 Thing2 Thing3 Thing4 Thing5
1: Gen1 id1|id3 20 10 20 10 20
2: Gen2 id2|id4 2 4 6 8 10
3: Gen3 id5 7 7 7 7 7
My data.frame df looks like this:
A 1
A 2
A 5
B 2
B 3
B 4
C 3
C 7
C 9
I want it to look like this:
A B C
1 2 3
2 3 7
5 4 9
I have tried spread() but probably not in the right way. Any ideas?
We can use unstack from base R
# Spread the col2 values into one column per distinct col1 value.
unstack(df1, col2 ~ col1)
# A B C
#1 1 2 3
#2 2 3 7
#3 5 4 9
Or with split
# split() yields one vector of col2 values per col1 value; data.frame() then
# binds those vectors as columns.
data.frame(split(df1$col2, df1$col1))
Or if we use spread or pivot_wider, make sure to create a sequence column
library(dplyr)
library(tidyr)
# rn numbers the rows within each col1 group; it is the sequence column that
# lets pivot_wider() line the groups up row by row. It is removed at the end.
df1 %>%
group_by(col1) %>%
mutate(rn = row_number()) %>%
ungroup %>%
pivot_wider(names_from = col1, values_from = col2) %>%
# or use
# spread(col1, col2) %>%
select(-rn)
# A tibble: 3 x 3
# A B C
# <int> <int> <int>
#1 1 2 3
#2 2 3 7
#3 5 4 9
Or using dcast
library(data.table)
# rowid(col1) numbers the rows within each col1 value, so the k-th value of
# every group lands on the k-th output row. value.var is named explicitly so
# dcast() does not have to guess which column holds the values.
dcast(setDT(df1), rowid(col1) ~ col1, value.var = "col2")[, .(A, B, C)]
data
# Example data: three groups (A, B, C) with three integer values each.
df1 <- data.frame(
  col1 = rep(c("A", "B", "C"), each = 3L),
  col2 = c(1L, 2L, 5L, 2L, 3L, 4L, 3L, 7L, 9L),
  stringsAsFactors = FALSE
)
In data.table, we can use dcast :
library(data.table)
# Cast col2 into one column per col1 value, aligned by within-group row
# number. The col1 column left in the result (presumably the rowid() helper)
# is then deleted by reference; the trailing [] prints the table.
dcast(setDT(df), rowid(col1)~col1, value.var = 'col2')[, col1 := NULL][]
# A B C
#1: 1 2 3
#2: 2 3 7
#3: 5 4 9
Suppose I have a data frame with categorical variable of n classes and a numerical variable. I need to randomize the numerical variable within each category. For example , consider the following table:
Col_1 Col_2
A 2
A 5
A 4
A 8
B 1
B 4
B 9
B 7
When I tried the sample() function in R, it shuffled the values across both categories together instead of within each category. Is there any function with which I can get this kind of output? (With or without replacement, it doesn't matter.)
Col_1 Col_2
A 8
A 4
A 2
A 5
B 9
B 7
B 4
B 1
You could sample row numbers within groups. In base R, we can use ave
# Shuffle the row numbers inside each Col_1 group and reorder df by them.
# The indices are permuted with sample.int() rather than sample(idx): for a
# group with a single row, sample(idx) would treat the lone index k as "draw
# from 1:k" and return the wrong number of (and wrong) row numbers.
df[with(df, ave(
  seq_len(nrow(df)),
  Col_1,
  FUN = function(idx) idx[sample.int(length(idx))]
)), ]
# Col_1 Col_2
#2 A 5
#4 A 8
#1 A 2
#3 A 4
#7 B 9
#5 B 1
#8 B 7
#6 B 4
In dplyr, we can use sample_n
library(dplyr)
# Draw 100% of each Col_1 group's rows without replacement, i.e. shuffle the
# rows within every group. slice_sample() replaces the superseded sample_n().
df %>%
  group_by(Col_1) %>%
  slice_sample(prop = 1)
data
# Example data: Col_1 is a factor with levels "A" and "B" (four rows each),
# Col_2 an integer column.
df <- data.frame(
  Col_1 = factor(rep(c("A", "B"), each = 4L), levels = c("A", "B")),
  Col_2 = c(2L, 5L, 4L, 8L, 1L, 4L, 9L, 7L)
)
Here's a dplyr solution:
library(dplyr)
set.seed(2)
# Permute Col_2 inside each Col_1 group. Indexing with sample.int(n()) avoids
# the sample() pitfall where a single numeric value k is read as "draw from
# 1:k", which would break any group that has only one row. For larger groups
# the random draws are the same as with sample(Col_2).
dat %>%
  group_by(Col_1) %>%
  mutate(Col_2 = Col_2[sample.int(n())]) %>%
  ungroup()
# # A tibble: 8 x 2
# Col_1 Col_2
# <chr> <int>
# 1 A 2
# 2 A 4
# 3 A 5
# 4 A 8
# 5 B 7
# 6 B 9
# 7 B 1
# 8 B 4
A data.table method:
library(data.table)
datDT <- as.data.table(dat)
set.seed(2)
# Permute Col_2 by reference inside each Col_1 group. sample.int(.N) is used
# instead of sample(Col_2) because sample() reads a single numeric value k as
# "draw from 1:k", which would break any group that has only one row.
datDT[, Col_2 := Col_2[sample.int(.N)], by = "Col_1"]
datDT
# Col_1 Col_2
# 1: A 2
# 2: A 4
# 3: A 5
# 4: A 8
# 5: B 7
# 6: B 9
# 7: B 1
# 8: B 4
Data
# Example data: Col_1 is a character group label, Col_2 an integer value.
dat <- data.frame(
  Col_1 = rep(c("A", "B"), each = 4L),
  Col_2 = c(2L, 5L, 4L, 8L, 1L, 4L, 9L, 7L),
  stringsAsFactors = FALSE
)
I have a dataframe like this
a b
1 A.1 1
2 A.2 2
3 A.3 1
5 B.1 2
6 B.2 2
7 B.3 1
I need to count for each letter (A and B here) the sum of the column b
a b
1 A 4
2 B 5
One option is using separate from tidyr to separate the column 'a' based on the delimiter ., group using the new 'a' and get the sum of 'b'.
library(tidyr)
library(dplyr)
# Split column 'a' into the letter part ('a') and the numeric suffix ('a1');
# no sep is given, so separate()'s default delimiter handling is relied on to
# split at the '.'. Then sum 'b' for each letter.
separate(df1, a, into=c('a', 'a1')) %>%
group_by(a) %>%
summarise(b=sum(b))
# a b
#1 A 4
#2 B 5
Or we can use data.table. Convert the 'data.frame' to 'data.table' (setDT(df1)). Use sub to remove the characters starting from ., followed by digits, use that as the grouping variable and get the sum of 'b'.
library(data.table)
# Group by 'a' with its trailing ".<digits>" suffix removed (computed on the
# fly inside by=) and sum 'b' within each group.
setDT(df1)[,list(b=sum(b)) , by = .(a=sub('\\.\\d+$', '', a))]
# a b
#1: A 4
#2: B 5
Or a similar option using the formula method of aggregate from base R.
# Sum 'b' per group, where the group is 'a' with its trailing ".<digits>"
# suffix removed; the cbind(a = ...) wrapper gives the computed grouping
# term the name 'a' in the output.
aggregate(b~cbind(a=sub('\\.\\d+$', '', a)), df1, FUN=sum)
# a b
# 1 A 4
# 2 B 5
Or using sqldf
library(sqldf)
# Group by the part of 'a' that precedes the first "." (substr up to
# instr(a, ".") - 1) and sum 'b' within each group.
sqldf('select substr(a, 1, instr(a, ".")-1) as a1,
sum(b) as b
from df1
group by a1')
# a1 b
#1 A 4
#2 B 5
data
# Example data: 'a' holds "<letter>.<number>" labels, 'b' integer values.
df1 <- data.frame(
  a = c("A.1", "A.2", "A.3", "B.1", "B.2", "B.3"),
  b = c(1L, 2L, 1L, 2L, 2L, 1L),
  stringsAsFactors = FALSE
)
let's have a two column data frame like this:
A 1
A 2
A 4
A 5
B 2
B 13
C 1
C 3
C 6
C 18
D 8
E 2
E 112
...
Is there a quick way in R to transform it into a two-column data frame like this?
A 1;2;4;5
B 2;13
C 1;3;6;18
D 8
E 2;112
And how to put it back to the first structure again?
Thank you
A base R option would be (from the comments by @David Arenburg)
# One row per Col1, with that group's Col2 values pasted together with ";".
res1 <- aggregate(Col2 ~ Col1, df1, paste, collapse = ";")
Or using data.table
library(data.table)
# Convert df1 to a data.table in place, then build one row per Col1 with the
# group's Col2 values pasted together with ";".
res2 <- setDT(df1)[, list(Col2=paste(Col2, collapse=";")), Col1]
Or with dplyr
library(dplyr)
# One row per Col1, with that group's Col2 values pasted together with ";".
res3 <- df1 %>%
group_by(Col1) %>%
summarise(Col2= paste(Col2, collapse=";") )
Update
To convert the output back to the original structure
library(splitstackshape)
# Split the collapsed Col2 strings of res2 on ';' and return them in 'long'
# form, to get back to the original one-value-per-row structure.
cSplit(res2, 'Col2', ';', 'long')
data
# Example data: Col1 is a character group label, Col2 an integer value.
df1 <- data.frame(
  Col1 = rep(c("A", "B", "C", "D", "E"), times = c(4L, 2L, 4L, 1L, 2L)),
  Col2 = c(1L, 2L, 4L, 5L, 2L, 13L, 1L, 3L, 6L, 18L, 8L, 2L, 112L),
  stringsAsFactors = FALSE
)
paste() with collapse = ";" is used in aggregate() to concatenate V2. To return it to the original structure, strsplit() is used to split V2 in lapply() - do.call() is just to bind the resulting list row-wise.
# Example input without a header row, so read.table() names the columns
# V1 (group label) and V2 (value).
df <- read.table(header = FALSE, text = "
A 1
A 2
A 4
A 5
B 2
B 13
C 1
C 3
C 6
C 18
D 8
E 2
E 112")
# Collapse: one row per V1, with that group's V2 values joined by ";".
# The formula interface aggregates only V2 and already names the result
# columns V1 and V2, so nothing has to be dropped or renamed afterwards.
df1 <- aggregate(V2 ~ V1, data = df, FUN = paste, collapse = ";")
df1
# V1 V2
#1 A 1;2;4;5
#2 B 2;13
#3 C 1;3;6;18
#4 D 8
#5 E 2;112
# Expand: split each collapsed string back into one row per value and bind
# the per-group pieces row-wise with do.call(rbind, ...).
df <- do.call(rbind, lapply(seq_len(nrow(df1)), function(i) {
  data.frame(
    V1 = df1$V1[i],
    V2 = strsplit(as.character(df1$V2[i]), ";", fixed = TRUE)[[1]],
    stringsAsFactors = FALSE
  )
}))
# strsplit() only returns character, so convert V2 back to its natural type
# to match the original structure.
df$V2 <- type.convert(df$V2, as.is = TRUE)
df
# V1 V2
#1 A 1
#2 A 2
#3 A 4
#4 A 5
#5 B 2
#6 B 13
#7 C 1
#8 C 3
#9 C 6
#10 C 18
#11 D 8
#12 E 2
#13 E 112