Related
I have two data-frames
The first data
col1 col2 col3
A1 4 11 15
A2 2 9 17
A3 3 4 4
B1 10 5 4
B2 6 1 8
C1 12 1 12
C2 2 5 8
D1 4 1 6
D2 2 1 8
The second data
meancol1 meancol2 meancol3
meanA 3 8 12
meanB 8 3 6
meanC 7 3 10
meanD 3 1 7
I want to combine the two data frames and keep the column names of the first one,
so the result I want is:
col1 col2 col3
A1 4 11 15
A2 2 9 17
A3 3 4 4
B1 10 5 4
B2 6 1 8
C1 12 1 12
C2 2 5 8
D1 4 1 6
D2 2 1 8
meanA 3 8 12
meanB 8 3 6
meanC 7 3 10
meanD 3 1 7
I tried the following function:
# rbind() on data frames matches columns by name, so df2's different column names make this fail
data_all <- rbind(df1,df2)
but it didn't work.
I also tried the function bind_rows from the dplyr package, but it creates new columns.
Thank you
You could always do:
# Copy df1's column names onto df2 so that rbind() can line the columns up
names(df2) <- names(df1)
data_all <- rbind(df1, df2)
data_all
A possible solution:
library(tidyverse)
# Example data. The header line has one field fewer than the data lines, so
# read.table() uses the first field of each row as the row name.
df1 <- read.table(text = " col1 col2 col3
A1 4 11 15
A2 2 9 17
A3 3 4 4
B1 10 5 4
B2 6 1 8
C1 12 1 12
C2 2 5 8
D1 4 1 6
D2 2 1 8
", header = TRUE)
df2 <- read.table(text = "meancol1 meancol2 meancol3
meanA 3 8 12
meanB 8 3 6
meanC 7 3 10
meanD 3 1 7
", header = TRUE)
# Strip the "mean" prefix so df2's names match df1's, then append df2 to df1.
# The pattern is anchored so that only a leading "mean" is removed.
df2 %>%
  rename_with(~ str_remove(.x, "^mean")) %>%
  bind_rows(df1, .)
#> col1 col2 col3
#> A1 4 11 15
#> A2 2 9 17
#> A3 3 4 4
#> B1 10 5 4
#> B2 6 1 8
#> C1 12 1 12
#> C2 2 5 8
#> D1 4 1 6
#> D2 2 1 8
#> meanA 3 8 12
#> meanB 8 3 6
#> meanC 7 3 10
#> meanD 3 1 7
You could do:
# setNames() returns df2 with df1's column names, so rbind() can match the columns
rbind(df1, setNames(df2, names(df1)))
col1 col2 col3
A1 4 11 15
A2 2 9 17
A3 3 4 4
B1 10 5 4
B2 6 1 8
C1 12 1 12
C2 2 5 8
D1 4 1 6
D2 2 1 8
meanA 3 8 12
meanB 8 3 6
meanC 7 3 10
meanD 3 1 7
Use mapply:
# Map() returns a list with one combined vector per column, so every column
# keeps its own type. mapply() would simplify the result to a single character
# matrix here, because the id column is character.
data.frame(Map(c, df1, df2))
output
id col1 col2 col3
1 A1 4 11 15
2 A2 2 9 17
3 A3 3 4 4
4 B1 10 5 4
5 B2 6 1 8
6 C1 12 1 12
7 C2 2 5 8
8 D1 4 1 6
9 D2 2 1 8
10 meanA 3 8 12
11 meanB 8 3 6
12 meanC 7 3 10
13 meanD 3 1 7
data
# Example data with the labels stored in an explicit id column.
# header = TRUE is spelled out because T is an ordinary, reassignable variable.
df1 <- read.table(text = "id col1 col2 col3
A1 4 11 15
A2 2 9 17
A3 3 4 4
B1 10 5 4
B2 6 1 8
C1 12 1 12
C2 2 5 8
D1 4 1 6
D2 2 1 8
", header = TRUE)
df2 <- read.table(text = "id meancol1 meancol2 meancol3
meanA 3 8 12
meanB 8 3 6
meanC 7 3 10
meanD 3 1 7
", header = TRUE)
Another option is to use rbindlist:
library(data.table)
# as.data.table() returns a converted copy of each data frame (row names move
# into an "rn" column) instead of converting df1/df2 by reference like setDT().
# use.names = FALSE binds by position, because df2's column names differ.
rbindlist(
  lapply(list(df1, df2), as.data.table, keep.rownames = TRUE),
  use.names = FALSE
)
# Or using `add_rownames` from `dplyr`
# rbindlist(lapply(list(df1, df2), add_rownames), use.names = FALSE)
Output
rn col1 col2 col3
1: A1 4 11 15
2: A2 2 9 17
3: A3 3 4 4
4: B1 10 5 4
5: B2 6 1 8
6: C1 12 1 12
7: C2 2 5 8
8: D1 4 1 6
9: D2 2 1 8
10: meanA 3 8 12
11: meanB 8 3 6
12: meanC 7 3 10
13: meanD 3 1 7
Data
# Example data for df1 and df2: the A1..D2 and meanA..meanD labels are stored as row names
df1 <- structure(list(col1 = c(4L, 2L, 3L, 10L, 6L, 12L, 2L, 4L, 2L),
col2 = c(11L, 9L, 4L, 5L, 1L, 1L, 5L, 1L, 1L), col3 = c(15L,
17L, 4L, 4L, 8L, 12L, 8L, 6L, 8L)), class = "data.frame", row.names = c("A1",
"A2", "A3", "B1", "B2", "C1", "C2", "D1", "D2"))
df2 <- structure(list(meancol1 = c(3L, 8L, 7L, 3L), meancol2 = c(8L,
3L, 3L, 1L), meancol3 = c(12L, 6L, 10L, 7L)), class = "data.frame", row.names = c("meanA",
"meanB", "meanC", "meanD"))
My issue is that I have a big database of 283 columns, some of which have the same name (for example, "uncultured").
Is there a way to select columns avoiding those with repeated names? Those (bacteria) normally have a very small abundance, so I don't really care for their contribution, I'd just like to take the columns with unique names.
My database is something like
Samples col1 col2 col3 col4 col2 col1....
S1
S2
S3
...
and I'd like to select every column but the second col2 and col1.
Thanks!
Something like this should work:
# Keep only the first column for each name. drop = FALSE guarantees the result
# is still a data frame even when a single column remains.
df[, !duplicated(colnames(df)), drop = FALSE]
Like this you will automatically select the first column with a unique name:
# Indexing by name picks the first column that matches each unique name
df[unique(colnames(df))]
#> col1 col2 col3 col4 S1 S2 S3
#> 1 1 2 3 4 7 8 9
#> 2 1 2 3 4 7 8 9
#> 3 1 2 3 4 7 8 9
#> 4 1 2 3 4 7 8 9
#> 5 1 2 3 4 7 8 9
Reproducible example
df is defined as:
# Toy data frame: five identical rows holding 1..9; the names "col1" and
# "col2" each appear twice on purpose.
df <- as.data.frame(matrix(rep(1:9, times = 5), nrow = 5, byrow = TRUE))
names(df) <- c(
  "col1", "col2", "col3", "col4", "col2", "col1",
  "S1", "S2", "S3"
)
df
#> col1 col2 col3 col4 col2 col1 S1 S2 S3
#> 1 1 2 3 4 5 6 7 8 9
#> 2 1 2 3 4 5 6 7 8 9
#> 3 1 2 3 4 5 6 7 8 9
#> 4 1 2 3 4 5 6 7 8 9
#> 5 1 2 3 4 5 6 7 8 9
I would like to expand a data frame in the following way:
GX will be substituted by three different variables Gs = (G4, G5, G6), which will all keep the attribute values of GX. For each new name in Gs, the current GX rows must be duplicated and the GX name replaced by the corresponding name from Gs.
set.seed(123)
# Four ids (G1, G2, G3, GX) with three subgroups each; total and C1..C3 are
# random permutations of 1..12, drawn in this order.
df <- data.frame(
  id = rep(c("G1", "G2", "G3", "GX"), each = 3),
  subgroup = rep(c(1, 2, 3), times = 4),
  total = sample.int(n = 12),
  C1 = sample.int(n = 12),
  C2 = sample.int(n = 12),
  C3 = sample.int(n = 12)
)
id subgroup total C1 C2 C3
1 G1 1 3 11 9 9
2 G1 2 12 5 3 12
3 G1 3 10 3 4 10
4 G2 1 2 9 1 7
5 G2 2 6 4 11 3
6 G2 3 11 1 7 4
7 G3 1 5 7 5 5
8 G3 2 4 12 10 6
9 G3 3 9 10 8 8
10 GX 1 8 2 2 2
11 GX 2 1 6 12 1
12 GX 3 7 8 6 11
I have one solution that includes a for loop:
# Names that will replace GX
Gs = c("G4", "G5", "G6")
# For each replacement name: copy the GX rows, relabel them and append them to df
for (ii in 1:length(Gs)) {
tmp.df <- df[df$id == "GX",]
tmp.df$id <- gsub(
pattern = "GX",
replacement = Gs[ii],
x = tmp.df$id
)
# The original GX rows stay in df, so the next pass can copy them again
df <- rbind(df, tmp.df)
}
# Finally drop the GX template rows themselves
df = df[df$id != "GX",]
Which gives:
id subgroup total C1 C2 C3
1 G1 1 3 11 9 9
2 G1 2 12 5 3 12
3 G1 3 10 3 4 10
4 G2 1 2 9 1 7
5 G2 2 6 4 11 3
6 G2 3 11 1 7 4
7 G3 1 5 7 5 5
8 G3 2 4 12 10 6
9 G3 3 9 10 8 8
101 G4 1 8 2 2 2
111 G4 2 1 6 12 1
121 G4 3 7 8 6 11
102 G5 1 8 2 2 2
112 G5 2 1 6 12 1
122 G5 3 7 8 6 11
103 G6 1 8 2 2 2
113 G6 2 1 6 12 1
123 G6 3 7 8 6 11
However, I would like to include the solution in a pipe and avoid the for loop. Is there a more idiomatic R approach, using tidyverse syntax, that can be included in a pipe?
Thanks
We can select only the rows where id == 'GX', create all their combinations with Gs, and bind the result to the original dataframe, removing the 'GX' rows.
library(dplyr)
df %>%
# keep only the GX template rows
filter(id == 'GX') %>%
# pair every GX row with every name in Gs (adds a column called Gs)
tidyr::crossing(Gs) %>%
# drop the old id and use the Gs column as the new id
select(-id, id = Gs) %>%
# append the expanded rows to the original data
bind_rows(df, .) %>%
# remove the GX rows that came in with df
filter(id != 'GX')
# id subgroup total C1 C2 C3
#1 G1 1 3 11 9 9
#2 G1 2 12 5 3 12
#3 G1 3 10 3 4 10
#4 G2 1 2 9 1 7
#5 G2 2 6 4 11 3
#6 G2 3 11 1 7 4
#7 G3 1 5 7 5 5
#8 G3 2 4 12 10 6
#9 G3 3 9 10 8 8
#10 G4 1 8 2 2 2
#11 G5 1 8 2 2 2
#12 G6 1 8 2 2 2
#13 G4 2 1 6 12 1
#14 G5 2 1 6 12 1
#15 G6 2 1 6 12 1
#16 G4 3 7 8 6 11
#17 G5 3 7 8 6 11
#18 G6 3 7 8 6 11
This question was previously answered by Ronak Shah here. Copying what he did there:
library(dplyr)
library(tidyr)
set.seed(123)
df = data.frame(
"id" = c(rep("G1", 3), rep("G2", 3), rep("G3", 3), rep("GX",3)),
"subgroup" = rep(c(1,2,3), 4),
"total" = sample.int(n = 12),
"C1" = sample.int(n=12),
"C2" = sample.int(n=12),
"C3" = sample.int(n=12))
# Repeat the GX rows: within the single id group, the row indices 1..n() are
# repeated n() times.
# NOTE(review): the repeat count is n() (the number of GX rows), which matches
# length(Gs) here only because both are 3. The result stays grouped by id and
# the ids are still "GX".
df2 <- df %>%
filter(id == "GX") %>%
group_by(id) %>%
slice(rep(seq_len(n()), n()))
Gives you:
> df2
# A tibble: 9 x 6
# Groups: id [1]
id subgroup total C1 C2 C3
<fct> <dbl> <int> <int> <int> <int>
1 GX 1 8 2 2 2
2 GX 2 1 6 12 1
3 GX 3 7 8 6 11
4 GX 1 8 2 2 2
5 GX 2 1 6 12 1
6 GX 3 7 8 6 11
7 GX 1 8 2 2 2
8 GX 2 1 6 12 1
9 GX 3 7 8 6 11
Which is very close to what you are after.
I have a dataframe (df) like this:
Rif dd A A A A A B B B B B C C C C C
a1 10 5 8 10 2 6 9 6 5 7 9 1 5 6 4 5
b1 20 12 7 1 5 9 10 5 3 8 7 3 6 1 9 8
c1 100 11 6 8 1 14 1 11 9 3 6 10 8 13 8 4
d1 70 4 3 7 8 11 19 2 6 7 1 20 18 7 10 7
I have a vector
rif <- c(0, 15, 50, 90, 110)
I would like to add to the df a column such that if dd(i) >= rif(i-1) & dd(i) < rif(i), the new column V1 takes the value of the corresponding (i-th) A column, giving:
Rif dd A A A A A B B B B B C C C C C V1
a1 10 5 8 10 2 6 9 6 5 7 9 1 5 6 4 5 8
b1 20 12 7 1 5 9 10 5 3 8 7 3 6 1 9 8 1
c1 100 11 6 8 1 14 1 11 9 3 6 10 8 13 8 4 14
d1 70 4 3 7 8 11 19 2 6 7 1 20 18 7 10 7 8
The same should be done for V2 and V3 with respect to Bs and Cs columns.
# Interval breakpoints
ref <- c(0, 15, 50, 90, 110)
for (i in 2:length(ref)) {
for (j in 1:nrow(df)) {
# NOTE(review): df$dd is the whole dd column here, not the value for row j
if (df$dd >= ref[i-1] && df$dd< ref[i]) {
# NOTE(review): this assigns to every row of V1, and column i is dd when i is 2
df[,"V1"] <- df[j,i]
}
}
}
I get the following error:
Error in if (..) :
missing value where TRUE/FALSE needed
Probably the if command is not the correct one.
could you help me?
I think you just need to better specify the rows and columns:
# Example data: columns are Rif, dd, then five A, five B and five C columns.
df <- data.frame(
  c("a1", "b1", "c1", "d1"),
  c(10, 20, 100, 70), c(5, 12, 11, 4), c(8, 7, 6, 3), c(10, 1, 8, 7),
  c(2, 5, 1, 8), c(6, 9, 14, 11),
  c(9, 10, 1, 19), c(6, 5, 11, 2), c(5, 3, 9, 6), c(7, 8, 3, 7),
  c(9, 7, 6, 1),
  c(1, 3, 10, 20), c(5, 6, 8, 18), c(6, 1, 13, 7), c(4, 9, 8, 10),
  c(5, 8, 4, 7)
)
colnames(df) <- c(
  "Rif", "dd",
  "A", "A", "A", "A", "A",
  "B", "B", "B", "B", "B",
  "C", "C", "C", "C", "C"
)
ref <- c(0, 15, 50, 90, 110)
# Create the result columns up front. Assigning df$V1[j] to a column that does
# not exist yet relies on recycling a partial vector: it can fail, or leave
# recycled values instead of NA in rows that match no interval.
df$V1 <- NA_real_
df$V2 <- NA_real_
df$V3 <- NA_real_
for (i in seq_along(ref)[-1]) {
  for (j in seq_len(nrow(df))) {
    # Row j belongs to interval i when ref[i - 1] <= dd < ref[i]
    if (df$dd[j] >= ref[i - 1] && df$dd[j] < ref[i]) {
      df$V1[j] <- df[j, i + 2]       # i-th column of the A block (cols 3-7)
      df$V2[j] <- df[j, i + 2 + 5]   # i-th column of the B block (cols 8-12)
      df$V3[j] <- df[j, i + 2 + 10]  # i-th column of the C block (cols 13-17)
    }
  }
}
which gives:
Rif dd A A A A A B B B B B C C C C C V1 V2 V3
1 a1 10 5 8 10 2 6 9 6 5 7 9 1 5 6 4 5 8 6 5
2 b1 20 12 7 1 5 9 10 5 3 8 7 3 6 1 9 8 1 3 1
3 c1 100 11 6 8 1 14 1 11 9 3 6 10 8 13 8 4 14 6 4
4 d1 70 4 3 7 8 11 19 2 6 7 1 20 18 7 10 7 8 7 10
Another option in base R:
# Letters that label the three column blocks
lters <- c(A = "A", B = "B", C = "C")
# Position of the first column of each block
firstcol <- lapply(lters, function(letter) match(letter, colnames(DF)))
# Index of the rif interval that each dd value falls into
idx <- findInterval(DF$dd, rif)
for (l in lters) {
  new_col <- paste0("V_", l)
  # one (row, column) pair per row: the block's first column shifted by idx
  picks <- cbind(seq_len(nrow(DF)), idx + firstcol[[l]])
  DF[, new_col] <- as.integer(DF[picks])
}
DF
output:
Rif dd A A.1 A.2 A.3 A.4 B B.1 B.2 B.3 B.4 C C.1 C.2 C.3 C.4 V_A V_B V_C
1 a1 10 5 8 10 2 6 9 6 5 7 9 1 5 6 4 5 8 6 5
2 b1 20 12 7 1 5 9 10 5 3 8 7 3 6 1 9 8 1 3 1
3 c1 100 11 6 8 1 14 1 11 9 3 6 10 8 13 8 4 14 6 4
4 d1 70 4 3 7 8 11 19 2 6 7 1 20 18 7 10 7 8 7 10
data:
# Example data with repeated column names (five columns each named A, B and C),
# followed by the interval breakpoints rif
DF <- structure(list(Rif = c("a1", "b1", "c1", "d1"), dd = c(10L, 20L,
100L, 70L), A = c(5L, 12L, 11L, 4L), A = c(8L, 7L, 6L, 3L), A = c(10L,
1L, 8L, 7L), A = c(2L, 5L, 1L, 8L), A = c(6L, 9L, 14L, 11L),
B = c(9L, 10L, 1L, 19L), B = c(6L, 5L, 11L, 2L), B = c(5L,
3L, 9L, 6L), B = c(7L, 8L, 3L, 7L), B = c(9L, 7L, 6L, 1L),
C = c(1L, 3L, 10L, 20L), C = c(5L, 6L, 8L, 18L), C = c(6L,
1L, 13L, 7L), C = c(4L, 9L, 8L, 10L), C = c(5L, 8L, 4L, 7L
)), class = "data.frame", row.names = c(NA, -4L))
rif <- c(0, 15, 50, 90, 110)
Another way is to reorganize the data by separating the lookup values into another table and performing an update join using data.table:
library(data.table)
setDT(DF)
# One output row per input row; rn is the row number used as the join key
out <- DF[, .(rn=.I, Rif, dd)]
# reorganizing data
# Flag the lookup columns (names containing A, B or C)
lc <- grepl("A|B|C", names(DF))
# Transpose so that each original row becomes a column V1, V2, ...; COL keeps
# the name of the source column
lutbl <- data.table(COL=names(DF)[lc], transpose(DF[, ..lc]))
# Long format: rn is recovered from the V<k> column name, and the rif
# breakpoints are repeated to fill a new rif column alongside the values
lutbl <- melt(lutbl, measure.vars=patterns("V"), variable.name="rn")[,
c("rn", "rif") := .(as.integer(gsub("V", "", rn)), rep(rif, sum(lc)*nrow(DF)/length(rif)))]
# lookup and update
# NOTE(review): lters is not defined in this snippet; presumably it is the
# c(A=, B=, C=) vector from the previous answer
# For each letter, a rolling join on (rn, rif = dd) fetches the value for every row of out
for (l in lters)
out[, paste0("NEW", l) := lutbl[COL==l][out, on=c("rn", "rif"="dd"), roll=-Inf, value]]
out:
rn Rif dd NEWA NEWB NEWC
1: 1 a1 10 8 6 5
2: 2 b1 20 1 3 1
3: 3 c1 100 14 6 4
4: 4 d1 70 8 7 10
I'm using tidyverse but a base solution is welcome, too.
Is there a way to, without transposing, gather a dataframe but instead of the key being the column names, the key is stored in a row. For example, let's say I have a tibble called df.
# Each column mixes five numbers with a group label in the last position, so
# every column is stored as character; columns come in pairs sharing a label
df <- tibble(a = c(5,3,5,6,2,"G1"),
b = c(5,3,5,6,2,"G1"),
c = c(8,2,6,4,1,"G2"),
d = c(8,2,6,4,1,"G2"),
e = c(9,3,7,8,4,"G3"),
f = c(9,3,7,8,4,"G3"),
g = c(6,5,2,1,8,"G4"),
h = c(6,5,2,1,8,"G4"))
df
# A tibble: 6 x 8
a b c d e f g h
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 5 5 8 8 9 9 6 6
2 3 3 2 2 3 3 5 5
3 5 5 6 6 7 7 2 2
4 6 6 4 4 8 8 1 1
5 2 2 1 1 4 4 8 8
6 G1 G1 G2 G2 G3 G3 G4 G4
The groups to group by or gather on is in the bottom row. Is there a way to get df to have three columns only, such that the columns c, e, and g are gathered into column a, columns d, f, and h are gathered into column b and row 6 becomes column c? The result would look like:
tibble(a = c(5,3,5,6,2,8,2,6,4,1,9,3,7,8,4,6,5,2,1,8),
b = c(5,3,5,6,2,8,2,6,4,1,9,3,7,8,4,6,5,2,1,8),
c = c("G1","G1","G1","G1","G1","G2","G2","G2","G2","G2",
"G3","G3","G3","G3","G3","G4","G4","G4","G4","G4"))
# A tibble: 20 x 3
a b c
<dbl> <dbl> <chr>
1 5 5 G1
2 3 3 G1
3 5 5 G1
4 6 6 G1
5 2 2 G1
6 8 8 G2
7 2 2 G2
8 6 6 G2
9 4 4 G2
10 1 1 G2
11 9 9 G3
12 3 3 G3
13 7 7 G3
14 8 8 G3
15 4 4 G3
16 6 6 G4
17 5 5 G4
18 2 2 G4
19 1 1 G4
20 8 8 G4
I would like to avoid transposing because I need the row and column orders preserved until everything is properly labeled.
Here is one idea.
library(tidyverse)
df2 <- df %>%
  # Transpose: the original columns become rows and the group label lands in V6
  t() %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  split(f = .$V6) %>%
  # Per group: drop the label, transpose back and name the two columns a and b;
  # the group name is stored in column c through .id
  map_dfr(~ .x %>%
    select(-V6) %>%
    t() %>%
    as.data.frame(stringsAsFactors = FALSE) %>%
    setNames(c("a", "b")),
  .id = "c") %>%
  select(a, b, c) %>%
  # across() replaces the superseded mutate_at(vars(), list(~ ...)) form
  mutate(across(-c, as.numeric))
df2
# a b c
# 1 5 5 G1
# 2 3 3 G1
# 3 5 5 G1
# 4 6 6 G1
# 5 2 2 G1
# 6 8 8 G2
# 7 2 2 G2
# 8 6 6 G2
# 9 4 4 G2
# 10 1 1 G2
# 11 9 9 G3
# 12 3 3 G3
# 13 7 7 G3
# 14 8 8 G3
# 15 4 4 G3
# 16 6 6 G4
# 17 5 5 G4
# 18 2 2 G4
# 19 1 1 G4
# 20 8 8 G4
Here is one implementation. We can split the tibble into a list of tibbles based on the last row, loop through the list with imap, rename the columns to the same column names ('a', 'b'), mutate to create the column 'c' with the list name, and bind the rows.
library(tidyverse)
df %>%
# all rows except the last one, which holds the group labels
slice(-n()) %>%
# split the columns into groups using the labels found in the last row
split.default(df %>%
slice(n()) %>%
flatten_chr) %>%
# for each group: name its two columns a and b, store the group name in c,
# then bind all groups by row
imap_dfr(~ .x %>%
rename_all(~ c('a', 'b')) %>%
mutate(c = .y))
# A tibble: 20 x 3
# a b c
# <chr> <chr> <chr>
# 1 5 5 G1
# 2 3 3 G1
# 3 5 5 G1
# 4 6 6 G1
# 5 2 2 G1
# 6 8 8 G2
# 7 2 2 G2
# 8 6 6 G2
# 9 4 4 G2
#10 1 1 G2
#11 9 9 G3
#12 3 3 G3
#13 7 7 G3
#14 8 8 G3
#15 4 4 G3
#16 6 6 G4
#17 5 5 G4
#18 2 2 G4
#19 1 1 G4
#20 8 8 G4
Transposing probably doesn't hurt if you do it step by step. In this base R solution, row and column information is kept until the last line.
# One row per original column; the group label ends up in column X6
d <- data.frame(t(as.matrix(df)))
# Split by group label and transpose each piece back to its original orientation
l <- lapply(split(d[-6], d$X6), t)
# Attach the group name as column c and stack the groups
res <- do.call(rbind, Map(cbind, l, c = names(l)))
res <- setNames(
  data.frame(res, row.names = NULL, stringsAsFactors = FALSE),
  letters[1:3]
)
# cbind()/rbind() produced a character matrix, so convert a and b back to
# numeric to match the requested result
res[c("a", "b")] <- lapply(res[c("a", "b")], as.numeric)
res
# a b c
# 1 5 5 G1
# 2 3 3 G1
# 3 5 5 G1
# 4 6 6 G1
# 5 2 2 G1
# 6 8 8 G2
# 7 2 2 G2
# 8 6 6 G2
# 9 4 4 G2
# 10 1 1 G2
# 11 9 9 G3
# 12 3 3 G3
# 13 7 7 G3
# 14 8 8 G3
# 15 4 4 G3
# 16 6 6 G4
# 17 5 5 G4
# 18 2 2 G4
# 19 1 1 G4
# 20 8 8 G4
One option with data.table
First, since we're not using the original names, replace them. Also remove the last row and convert everything to integer.
library(data.table)
setDT(df)
# drop the last row, which holds the group labels
df <- df[-.N]
# convert every remaining column to integer
df[, names(df) := lapply(.SD, as.integer)]
# relabel the columns a, b, a, b, ... (the names repeat on purpose)
setnames(df, rep_len(c('a', 'b'), ncol(df)))
# a b a b a b a b
# 1: 5 5 8 8 9 9 6 6
# 2: 3 3 2 2 3 3 5 5
# 3: 5 5 6 6 7 7 2 2
# 4: 6 6 4 4 8 8 1 1
# 5: 2 2 1 1 4 4 8 8
Now melt on the row number, add the G[1-4] column, and dcast melted df to wide form.
# remember each row's position so it can identify rows after reshaping
df[, rid := 1:.N]
df2 <- melt(df, 'rid')
# number the repeated (rid, variable) pairs to build the group label G1..G4
df2[, c := paste0('G', rowid(rid, variable))]
# back to wide form with one a and one b per (rid, c); sort by group and drop rid
dcast(df2, rid + c ~ variable)[order(c), -'rid']
# c a b
# 1: G1 5 5
# 2: G1 3 3
# 3: G1 5 5
# 4: G1 6 6
# 5: G1 2 2
# 6: G2 8 8
# 7: G2 2 2
# 8: G2 6 6
# 9: G2 4 4
# 10: G2 1 1
# 11: G3 9 9
# 12: G3 3 3
# 13: G3 7 7
# 14: G3 8 8
# 15: G3 4 4
# 16: G4 6 6
# 17: G4 5 5
# 18: G4 2 2
# 19: G4 1 1
# 20: G4 8 8