# Lookup table: one row per V1 key, with that key's codes held in V2 and V3.
data_1 <- data.frame(
  V1 = c("123", "345", "546", "890"),
  V2 = c("J10", "K12", "R34", "J17"),
  V3 = c("N12", "M34", "W57", "Q90")
)
# Print the data frame.
data_1
| V1 | V2 | V3 |
|:---- |:------:| -----:|
| 123 | J10 | N12 |
| 345 | K12 | M34 |
| 546 | N12 | R34 |
| 890 | J17 | J10 |
# Date table: same V1 keys as data_1; V2/V3 hold the dates that belong to the
# codes stored in the matching cells of data_1.
data_2 <- data.frame(
  V1 = c("123", "345", "546", "890"),
  V2 = c("01/02/90", "10/04/21", "09/03/95", "29/03/90"),
  V3 = c("28/07/86", "16/02/87", "17/10/56", "14/01/60")
)
# Print the data frame.
data_2
| V1 | V2 | V3 |
|:---- |:------:| -----:|
| 123 | 01/02/90 | 28/07/86 |
| 345 | 10/04/21 | 16/02/87 |
| 546 | 09/03/95 | 17/10/56 |
| 890 | 29/03/90 | 14/01/60 |
I would like to keep the common first column and collapse the data into an array structure, using the codes from data_1 as column names and the matching dates from data_2 as values.
Result:
| V1 | J10 | N12 | K12 | M34 | R34 | J17 |
|:---- |:----:| :----:| :----: | :----: | :----: | ----:|
| 123 | 01/02/90 | 28/07/86 || | | |
| 345 | | |10/04/21|16/02/87 | | |
| 546 | | 09/03/95 || |17/10/56 | |
| 890 |14/01/60 | || | | 29/03/90 |
We may reshape to 'long' format, bind the datasets and then reshape back to 'wide'
library(dplyr)
library(tidyr)
# Reshape both tables to long format, pair every code in `data_1` with the date
# stored in the same row (`V1`) and the same source column (`name`) of `data_2`,
# then spread the codes out into columns.
# Joining on explicit keys avoids depending on the two long tables happening to
# be in the same row order, and on the auto-repaired column names
# (`value...3`, `value...5`) that `bind_cols()` generates for duplicates.
data_1 %>%
  pivot_longer(cols = -V1, values_to = "code") %>%
  inner_join(
    data_2 %>%
      pivot_longer(cols = -V1, values_to = "date"),
    by = c("V1", "name")
  ) %>%
  # `name` (the source column, V2/V3) was only needed as a join key.
  select(-name) %>%
  pivot_wider(names_from = code, values_from = date)
-output
# A tibble: 4 × 9
V1 J10 N12 K12 M34 R34 W57 J17 Q90
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 123 01/02/90 28/07/86 <NA> <NA> <NA> <NA> <NA> <NA>
2 345 <NA> <NA> 10/04/21 16/02/87 <NA> <NA> <NA> <NA>
3 546 <NA> <NA> <NA> <NA> 09/03/95 17/10/56 <NA> <NA>
4 890 <NA> <NA> <NA> <NA> <NA> <NA> 29/03/90 14/01/60
data_1 <- data.frame(V1 = c("123","345","546","890"), V2 = c("J10","K12","R34","J17"),V3=c("N12","M34","W57","Q90"))
data_2 <- data.frame(V1 = c("123","345","546","890"), V2 = c("01/02/90","10/04/21","09/03/95","29/03/90"),V3=c("28/07/86","16/02/87","17/10/56","14/01/60"))
# Stack the code columns of data_1 into one long table. SRC records which
# source column (V2 or V3) each code came from so that it can be matched to the
# date sitting in the same cell of data_2.
var_1 <- data.frame(V1 = data_1$V1, SRC = "V2", VAR = as.character(data_1$V2), stringsAsFactors = FALSE)
var_2 <- data.frame(V1 = data_1$V1, SRC = "V3", VAR = as.character(data_1$V3), stringsAsFactors = FALSE)
var <- bind_rows(var_1, var_2)
# Same for the dates. date_2 must read data_2$V3: reading V2 twice (as before)
# silently assigned the V2 date to every V3 code.
date_1 <- data.frame(V1 = data_2$V1, SRC = "V2", DATE = as.character(data_2$V2), stringsAsFactors = FALSE)
date_2 <- data.frame(V1 = data_2$V1, SRC = "V3", DATE = as.character(data_2$V3), stringsAsFactors = FALSE)
date <- bind_rows(date_1, date_2)
# Join on both keys so each code gets exactly one date; joining on V1 alone
# would pair every code of a row with every date of that row.
result <- left_join(var, date, by = c("V1", "SRC")) %>%
  select(-SRC) %>%
  pivot_wider(names_from = VAR, values_from = DATE)
result
# # A tibble: 4 × 9
#   V1    J10      K12      R34      J17      N12      M34      W57      Q90
#   <chr> <chr>    <chr>    <chr>    <chr>    <chr>    <chr>    <chr>    <chr>
# 1 123   01/02/90 NA       NA       NA       28/07/86 NA       NA       NA
# 2 345   NA       10/04/21 NA       NA       NA       16/02/87 NA       NA
# 3 546   NA       NA       09/03/95 NA       NA       NA       17/10/56 NA
# 4 890   NA       NA       NA       29/03/90 NA       NA       NA       14/01/60
Related
I need to add 2 rows to a dataframe that have the same values as existing rows. For example, below I would need to add "a" = 3 with the same "b" values as "a" = 2, going from this:
| a | b |
| --| ------|
| 1 | higha |
| 1 | lowa |
| 2 | highb |
| 2 | lowb |
to this:
| a | b |
| --| ------|
| 1 | higha |
| 1 | lowa |
| 2 | highb |
| 2 | lowb |
| 3 | highb |
| 3 | lowb |
A one-liner in base R would be:
# Copy the rows where a == 2, relabel the copies as a == 3, append them to
# `df`, and drop the row names so the printed result is numbered 1..n.
local({
  copies <- df[df$a == 2, ]
  copies$a <- 3
  combined <- rbind(df, copies)
  rownames(combined) <- NULL
  combined
})
#> a b
#> 1 1 higha
#> 2 1 lowa
#> 3 2 highb
#> 4 2 lowb
#> 5 3 highb
#> 6 3 lowb
We may use
library(dplyr)
library(tidyr)
# Repeat each a == 2 row twice (weight 2; every other row gets weight 1),
# relabel the second copy of each repeated row as a == 3, then sort by `a`.
df |>
  uncount(weights = 1 + (a == 2)) |>
  mutate(a = replace(a, a == 2 & duplicated(b), 3)) |>
  arrange(a)
-output
# A tibble: 6 × 2
a b
<dbl> <chr>
1 1 higha
2 1 lowa
3 2 highb
4 2 lowb
5 3 highb
6 3 lowb
Or with base R
# Flag the rows to copy, build the new a == 3 rows from their `b` values, and
# assign them to the row positions just past the current end of `df`.
is_two <- df$a == 2
new_rows <- data.frame(a = 3, b = df$b[is_two])
df[nrow(df) + seq_len(sum(is_two)), ] <- new_rows
data
# Example input: two groups (a = 1, 2) with a high/low label each.
df <- data.frame(
  a = c(1L, 1L, 2L, 2L),
  b = c("higha", "lowa", "highb", "lowb")
)
This question already has answers here:
How to reshape data from long to wide format
(14 answers)
Closed 5 months ago.
I have a dataset that looks like this:
ID | age | disease
smith192 | 17 | lung_cancer
green484 | 12 | diabetes
green484 | 13 | heart_irregularities
tom584 | 12 | colon_cancer
tom584 | 14 | diabetes
tom584 | 15 | malnutrition
And I would like R to organize it into this:
ID | age_1 | disease_1 | age_2 | disease_2 | age_3 | disease_3 |
smith192 | 17 | lung_cancer | NA | NA | NA | NA |
green484 | 12 | diabetes | 13 | heart_irregularities | NA | NA |
tom584 | 12 | colon_cancer | 14 | diabetes | 15 | malnutrition |
Any help would be greatly appreciated!
You could create disease indices for each ID and then pivot the data to wide.
base
# Number the rows within each ID (1, 2, ...) and use that counter as the "time"
# variable, so each ID's age/disease pairs spread into age.1, disease.1, ...
# seq_along() is used instead of seq(): seq(x) on a length-one numeric x
# expands to 1:x, which would corrupt the counter for numeric IDs.
df |>
  transform(n = ave(ID, ID, FUN = seq_along)) |>
  reshape(direction = "wide", idvar = "ID", timevar = "n", v.names = c("age", "disease"))
# ID age.1 disease.1 age.2 disease.2 age.3 disease.3
# 1 smith192 17 lung_cancer NA <NA> NA <NA>
# 2 green484 12 diabetes 13 heart_irregularities NA <NA>
# 4 tom584 12 colon_cancer 14 diabetes 15 malnutrition
tidyverse
library(dplyr)
library(tidyr)
# Add a per-ID row counter `n`, then widen: one row per ID with columns
# age_<n> and disease_<n>.
df %>%
  group_by(ID) %>%
  mutate(n = row_number()) %>%
  ungroup() %>%
  # `id_cols` is named explicitly: it sits after `...` in pivot_wider(), and
  # passing it by position is deprecated in recent tidyr releases.
  pivot_wider(id_cols = ID, names_from = n, values_from = c(age, disease))
# # A tibble: 3 × 7
# ID age_1 age_2 age_3 disease_1 disease_2 disease_3
# <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr>
# 1 smith192 17 NA NA lung_cancer NA NA
# 2 green484 12 13 NA diabetes heart_irregularities NA
# 3 tom584 12 14 15 colon_cancer diabetes malnutrition
Data
# Example input: one row per (ID, age, disease) observation; IDs repeat.
df <- data.frame(
  ID = c("smith192", "green484", "green484", "tom584", "tom584", "tom584"),
  age = c(17, 12, 13, 12, 14, 15),
  disease = c(
    "lung_cancer", "diabetes", "heart_irregularities",
    "colon_cancer", "diabetes", "malnutrition"
  ),
  stringsAsFactors = FALSE
)
I have two data frames
df1 is like this
| NOC | 2007 | 2008 |
|:---- |:------:| -----:|
| A | 100 | 5 |
| B | 100 | 5 |
| C | 100 | 5|
| D | 20 | 2 |
| E | 10 | 12 |
| F | 2 | 1 |
df2
| NOC | GROUP |
|:---- |:------:|
| A | aa|
| B | aa |
| C | aa |
| D | bb |
| E | bb |
| F | cc |
I would like to create a new df3 which will aggregate the columns 2007 and 2008 based on Group identity by assigning the sum of rows with the same group identity, so my df3 would look like this
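| NOC | 2007 | 2008 | GROUP | s2007 | s2008 |
|:---- |:----:|:----:|:-----:|:-----:| -----:|
| A | 100 | 5 | aa | 300 | 15 |
| B | 100 | 5 | aa | 300 | 15 |
| C | 100 | 5 | aa | 300 | 15 |
| D | 20 | 2 | bb | 30 | 14 |
| E | 10 | 12 | bb | 30 | 14 |
| F | 2 | 1 | cc | 2 | 1 |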
My code is not very efficient. I first merged df1 with df2 by NOC into df3:
# Left join: keep every row of df1 and attach its GROUP from df2 via NOC.
df3 <- merge(x = df1, y = df2, by = "NOC", all.x = TRUE)
Then I used dplyr to summarise by GROUP, creating s2007 and s2008 (the code below stores the result back into df3):
# Per-GROUP row count and yearly totals.
# The year columns have non-syntactic names and must be quoted with backticks:
# a bare `sum(2007)` sums the literal number 2007, not the column.
df3 <- df3 %>%
  group_by(GROUP) %>%
  summarise(num = n(),
            s2007 = sum(`2007`), s2008 = sum(`2008`))
then I merged df1 with df3 again to create my final database
I have two questions:
1. Is there a more efficient way?
2. My data frame contains annual data for 2007-2030, and I am currently writing out the summarise call for each year by hand. Is there a faster way to summarise all of the columns except NOC?
Thank you!
Before the answer, a small piece of advice: avoid purely numeric column names. They are non-syntactic in R, have to be quoted with backticks every time, and can cause subtle bugs.
library(tidyverse)
# Attach each NOC's GROUP, then add a per-GROUP total next to every value
# column, named s.<column> (e.g. s.2007).
# across(-NOC) picks up every non-grouping column except NOC, so the same code
# covers all year columns (2007-2030) without listing them one by one.
df1 %>%
  left_join(df2, by = "NOC") %>%
  group_by(GROUP) %>%
  mutate(across(-NOC, sum, .names = "s.{.col}"))
# A tibble: 6 x 6
# Groups: GROUP [3]
NOC `2007` `2008` GROUP s.2007 s.2008
<chr> <int> <int> <chr> <int> <int>
1 A 100 5 aa 300 15
2 B 100 5 aa 300 15
3 C 100 5 aa 300 15
4 D 20 2 bb 30 14
5 E 10 12 bb 30 14
6 F 2 1 cc 2 1
At the moment I'm trying to figure out how to keep the names of the inner and outer lists nested within a tibble while unnesting.
The .id parameter of the unnest function is the closest I found, but it starts to number the values instead of using the given names.
Here is an MWE, together with my idea of the final tibble:
library(dplyr)
library(tidyr)
# Example input: a two-row tibble with two list-columns.
#   x: one label per row ("Foo", "Bar").
#   y: per row, a named list (a/b, c/d) whose elements are themselves named
#      lists of numbers (aa/ab, ...), i.e. two levels of names to preserve.
df.1 <- tibble(
x = list("Foo","Bar"),
y = list(
list(a = list(aa = 1, ab = 2), b = list(ba = 6, bb = 22)),
list(c = list(ca = 561, cb = 35), d = list(da = 346, db = 17))
)
)
# Unnest one level at a time, asking for an id column at each level.
# NOTE(review): `.id` belongs to the legacy unnest() interface; as described in
# the question it yields running numbers rather than the list names -- confirm
# whether the installed tidyr version still accepts this argument.
df.2 <- unnest(df.1, .id = "name.outher")
df.3 <- unnest(df.2, .id = "name.inner")
# How do I get from this:
#
#-----------------------------------------------------------------------
# x | y |
#-----+----------------------------------------------------------------+
# Foo | list(a = list(aa = 1, ab = 2), b = list(ba = 6, bb = 22)) |
#-----+----------------------------------------------------------------+
# Bar | list(c = list(ca = 561, cb = 35), d = list(da = 346, db = 17)) |
#-----------------------------------------------------------------------
#
# to this:
#
#---------------------------------------
# x | name.outher | y | name.inner |
#-----+-------------+-----+------------+
# Foo | a | 1 | aa |
#-----+-------------+-----+------------+
# Foo | a | 2 | ab |
#-----+-------------+-----+------------+
# Foo | b | 6 | ba |
#-----+-------------+-----+------------+
# Foo | b | 22 | bb |
#-----+-------------+-----+------------+
# Bar | c | 561 | ca |
#-----+-------------+-----+------------+
# Bar | c | 35 | cb |
#-----+-------------+-----+------------+
# Bar | d | 346 | da |
#-----+-------------+-----+------------+
# Bar | d | 17 | db |
#-------------------------------------
#
# instead of this:
#
#---------------------------------------
# x | name.outher | y | name.inner |
#-----+-------------+-----+------------+
# Foo | 1 | 1 | 1 |
#-----+-------------+-----+------------+
# Foo | 1 | 2 | 1 |
#-----+-------------+-----+------------+
# Foo | 1 | 6 | 2 |
#-----+-------------+-----+------------+
# Foo | 1 | 22 | 2 |
#-----+-------------+-----+------------+
# Bar | 2 | 561 | 3 |
#-----+-------------+-----+------------+
# Bar | 2 | 35 | 3 |
#-----+-------------+-----+------------+
# Bar | 2 | 346 | 4 |
#-----+-------------+-----+------------+
# Bar | 2 | 17 | 4 |
#---------------------------------------
Do you have any idea how I can preserve the names while unnesting this data structure?
We can melt
library(reshape2)
library(dplyr)
# Melt the nested list-column `y` into long form, then rename the columns
# produced by melt(): L1/L2/L3 are the list levels (outermost first) and
# `value` holds the leaf numbers.
select(
  melt(df.1[["y"]]),
  x = L1,
  name.outher = L2,
  y = value,
  name.inner = L3
)
# x name.outher y name.inner
#1 1 a 1 aa
#2 1 a 2 ab
#3 1 b 6 ba
#4 1 b 22 bb
#5 2 c 561 ca
#6 2 c 35 cb
#7 2 d 346 da
#8 2 d 17 db
Or use map and as_tibble
library(tidyverse)
# Walk the two list levels of `y` with nested map_df() calls:
#   outer map_df: one element per row of df.1, recorded in `x` via .id
#   inner map_df: one element per outer name (a, b, ...), recorded in
#                 `name.outer` via .id
# At the innermost level the named values become a one-row tibble that is
# reshaped to (name.inner, y) pairs. pivot_longer() replaces the superseded
# gather() and yields the same columns in the same order.
df.1 %>%
  pull(y) %>%
  map_df(~ as_tibble(.x) %>%
           map_df(~ as_tibble(.x) %>%
                    pivot_longer(cols = everything(),
                                 names_to = "name.inner",
                                 values_to = "y"),
                  .id = "name.outer"),
         .id = "x")
# A tibble: 8 x 4
# x name.outer name.inner y
# <chr> <chr> <chr> <dbl>
#1 1 a aa 1
#2 1 a ab 2
#3 1 b ba 6
#4 1 b bb 22
#5 2 c ca 561
#6 2 c cb 35
#7 2 d da 346
#8 2 d db 17
I want to spread name column.
# Example input: IDs 1, 2, 3 with two, three and two names respectively.
d <- data.frame(
  ID = rep(c(1, 2, 3), times = c(2, 3, 2)),
  name = c("a", "b", "a", "c", "d", "c", "d")
)
| ID | name |
|-----|------|
| 1 | a |
| 1 | b |
| 2 | a |
| 2 | c |
| 2 | d |
| 3 | c |
| 3 | d |
Using tidyr::spread() I can get the following data frame:
# Spread `name` against itself: one column per distinct name, filled with the
# name where it occurs for that ID and NA elsewhere.
tidyr::spread(d, key = name, value = name)
| ID| a | b | c | d |
| 1 | a | b | NA| NA|
| 2 | a | NA| c | d |
| 3 | NA| NA| c | d |
But I want to get a data frame like this:
| ID | name1 | name2 | name3 |
|-----|-------|-------|-------|
| 1 | a | b | NA |
| 2 | a | c | d |
| 3 | c | d | NA |
We can create a new column and spread
library(tidyverse)
# Label each ID's rows name1, name2, ... in order of appearance, then widen so
# that every label becomes a column (missing positions are filled with NA).
# pivot_wider() replaces the superseded spread(); the grouping is dropped once
# the per-ID counter has been computed so the result is a plain tibble.
d %>%
  group_by(ID) %>%
  mutate(new = paste0("name", row_number())) %>%
  ungroup() %>%
  pivot_wider(names_from = new, values_from = name)
# ID name1 name2 name3
#* <dbl> <fctr> <fctr> <fctr>
#1 1 a b NA
#2 2 a c d
#3 3 c d NA
It is relatively concise with dcast
library(data.table)
# Convert `d` to a data.table by reference, then cast to wide: one row per ID,
# one column per within-ID position (name1, name2, ...).
setDT(d)
dcast(d, ID ~ paste0("name", rowid(ID)), value.var = "name")