Given a (simplified) dataframe with format
df <- data.frame(a = c(1,2,3,4),
b = c(4,3,2,1),
temp1 = c("-","-","-","foo: 3"),
temp2 = c("-","bar: 10","-","bar: 4")
)
a b temp1 temp2
1 4 - -
2 3 - bar: 10
3 2 - -
4 1 foo: 3 bar: 4
I need to rename all temp columns with the names contained within the column. My end goal is to end up with this:
a b foo bar
1 4 - -
2 3 - 10
3 2 - -
4 1 3 4
The df column names and the data contained within them will be unknown; however, the columns that need changing will contain "temp" in their name, and the delimiter will always be a ":".
As such I can easily remove the name from within the columns using dplyr like this:
# Strip the "name:" prefix (and any spaces after the colon) from every temp
# column. Cells without a colon, such as the "-" placeholder, are left as-is
# instead of being turned into NA.
df <- df %>%
  mutate(across(contains("temp"), ~ str_remove(.x, "^[^:]*:\\s*")))
but first I need to rename the columns based on some function method, that scans the column and returns the value(s) within it, ie.
rename_at(vars(contains("temp")), ~(...some function.....))
As per the example given there's no guarantee that specific rows will have data so I can't simply grab value from row 1
Any ideas welcome.
Thanks in advance
One possibility involving dplyr and tidyr could be:
# Reshape the temp columns to long form, split each "name: value" cell into
# its two parts, spread the name over the whole source column, and pivot back
# so that every name becomes its own column.
df %>%
  pivot_longer(
    cols = -c(a:b),
    names_to = "variables",
    values_to = "values"
  ) %>%
  # "-" marks an empty cell; treat it as missing so it can be filled below.
  mutate(values = replace(values, values == "-", NA_character_)) %>%
  separate(
    col = values,
    into = c("variables2", "values"),
    sep = ": "
  ) %>%
  group_by(variables) %>%
  fill(variables2, .direction = "downup") %>%
  ungroup() %>%
  select(-variables) %>%
  pivot_wider(
    names_from = "variables2",
    values_from = "values"
  )
a b foo bar
<dbl> <dbl> <chr> <chr>
1 1 4 <NA> <NA>
2 2 3 <NA> 10
3 3 2 <NA> <NA>
4 4 1 3 4
If you want to further replace the NAs with -:
# Same reshaping as before, followed by turning the NAs back into "-".
df %>%
  pivot_longer(names_to = "variables", values_to = "values", -c(a:b)) %>%
  mutate(values = replace(values, values == "-", NA_character_)) %>%
  separate(values, into = c("variables2", "values"), sep = ": ") %>%
  group_by(variables) %>%
  fill(variables2, .direction = "downup") %>%
  ungroup() %>%
  select(-variables) %>%
  pivot_wider(names_from = "variables2", values_from = "values") %>%
  # across() replaces the superseded mutate_at(); a and b stay untouched.
  mutate(across(-c(a, b), ~ replace_na(.x, "-")))
a b foo bar
<dbl> <dbl> <chr> <chr>
1 1 4 - -
2 2 3 - 10
3 3 2 - -
4 4 1 3 4
This will do the job:
# Rename every "temp" column after the label stored in its cells
# ("label: value"). Exactly one name is produced per column, so the
# replacement always has the same length as the set of columns being renamed.
colnames(df)[grepl("temp", colnames(df))] <- vapply(
  colnames(df)[grepl("temp", colnames(df))],
  function(nm) {
    # Cells containing word characters carry a label; "-" placeholders do not.
    labelled <- grep("\\w+", df[[nm]], value = TRUE)
    if (length(labelled) == 0L) {
      # No label anywhere in the column: keep the current name.
      nm
    } else {
      # The label is everything before the first colon.
      sub("[:].*", "", labelled[1L])
    }
  },
  character(1),
  USE.NAMES = FALSE
)
Related
I need to summarize one variable/column of a long table after aggregating (group_by()) by another variable/column, I need to have the summarized value by all values of other variables/columns.
Here is test data:
library(tidyverse)
set.seed(123)
Site <- str_c("S", 1:5)
Species <- str_c("Sps", 1:6)
print(Species_tbl <- bind_cols(Species = Species,
Exotic = rbinom(length(Species), 1, .3),
Migrant = rbinom(length(Species), 2, .3)))
Data_tbl <- expand.grid(Site = Site,
Species = Species) %>%
left_join(Species_tbl)
Data_tbl$Presence <- rbinom(nrow(Data_tbl), 1, .5)
And here is my best effort:
print(Data_tbl %>%
group_by(Site) %>%
summarise(N_sp = sum(Presence),
N_sp_Exo = sum(Presence[Exotic == 1]),
N_sp_Nat = sum(Presence[Exotic == 0]),
N_sp_M0 = sum(Presence[Migrant == 0]),
N_sp_M1 = sum(Presence[Migrant == 1]),
N_sp_M2 = sum(Presence[Migrant == 2])))
You can get the data in long format for your columns of interest, c(Exotic, Migrant), and take the sum of the Presence column for each unique column name and its values. This can then be merged with the sum of Presence for each Site.
library(dplyr)
library(tidyr)
data1 <- Data_tbl %>%
group_by(Site) %>%
summarise(N_sp = sum(Presence))
data2 <- Data_tbl %>%
pivot_longer(cols = c(Exotic, Migrant)) %>%
group_by(Site, name, value) %>%
summarise(result = sum(Presence), .groups = "drop") %>%
pivot_wider(names_from = c(name, value), values_from = result)
inner_join(data1, data2, by = 'Site')
# Site N_sp Exotic_0 Exotic_1 Migrant_0 Migrant_1 Migrant_2
# <fct> <int> <int> <int> <int> <int> <int>
#1 S1 4 2 2 1 2 1
#2 S2 3 2 1 0 2 1
#3 S3 2 1 1 0 2 0
#4 S4 4 2 2 1 3 0
#5 S5 4 1 3 1 2 1
The answer has been divided in two steps for ease of readability. If you would like to do this in a single chain without creating temporary variables that can be done as well.
df_input is the input file, and the ideal output file is df_output.
df_input <- data.frame(id = c(1,2,3,4,4,5,5,5,6,7,8,9,10),
party = c("A","B","C","D","E","F","G","H","I","J","K","L","M"),
winner= c(1,1,1,1,1,1,1,1,1,1,1,1,1))
df_output <- data.frame(id = c(1,2,3,4,5,6,7,8,9,10),
party = c("A","B","C","D,E","F_G_H","I","J","K","L","M"),
winner_sum = c(1,1,1,2,3,1,1,1,1,1))
Previously the code worked using the "summarise_at" function as follows:
df_output <- df_input %>%
dplyr::group_by_at(.vars = vars(id)) %>%
{left_join(
dplyr::summarise_at(., vars(party), ~ str_c(., collapse = ",")),
dplyr::summarise_at(., vars(winner), funs(sum))
)}
But it no longer works, as it seems both "summarise_at" and "funs" have been deprecated.
I am trying to replicate using across with dplyr (1.0.10), but I am getting an error. Here is my attempt:
df_output <- df_input %>%
group_by(id) %>%
summarise(across(winner, sum, na.rm=T)) %>%
summarise(across(party, str_c(., collapse = ",")))
I have multiple numeric and character variables, not just one as in the example. Thanks a lot.
We don't need across if we need to apply different functions on single columns
library(dplyr)
library(stringr)
df_input %>%
group_by(id) %>%
summarise(party = str_c(party, collapse = ","),
winner_sum = sum(winner))
-output
# A tibble: 10 × 3
id party winner_sum
<dbl> <chr> <dbl>
1 1 A 1
2 2 B 1
3 3 C 1
4 4 D,E 2
5 5 F,G,H 3
6 6 I 1
7 7 J 1
8 8 K 1
9 9 L 1
10 10 M 1
If there are multiple 'party', 'winner' columns, loop across them in a single summarise as after the first summarise we have only the summarised column with the group column
# Summarise all columns in one summarise() call: after the first summarise()
# only the group column and the summarised columns would remain.
df_input %>%
  group_by(id) %>%
  summarise(
    # Extra arguments go inside the lambda; passing them through across(...)
    # is deprecated as of dplyr 1.1.0.
    across(winner, ~ sum(.x, na.rm = TRUE)),
    across(party, ~ str_c(.x, collapse = ",")),
    .groups = "drop"
  )
-output
# A tibble: 10 × 3
id winner party
<dbl> <dbl> <chr>
1 1 1 A
2 2 1 B
3 3 1 C
4 4 2 D,E
5 5 3 F,G,H
6 6 1 I
7 7 1 J
8 8 1 K
9 9 1 L
10 10 1 M
NOTE: If the columns have a similar prefix, then use starts_with to select all those columns, i.e. across(starts_with("party"), ...); if there are different column names, use across(c(party, othercol), ...); or, if the functions applied are based on their type, use across(where(is.numeric), sum, na.rm = TRUE)
# Sum every numeric column and collapse every character column, per id.
df_input %>%
  group_by(id) %>%
  summarise(
    # Lambdas carry the extra arguments; across(..., na.rm = TRUE) style
    # argument forwarding is deprecated as of dplyr 1.1.0.
    across(where(is.numeric), ~ sum(.x, na.rm = TRUE)),
    across(where(is.character), ~ str_c(.x, collapse = ",")),
    .groups = "drop"
  )
I would like to combine two variables that have only one answer each into a single variable that has both answers.
Example
IPV_YES only has answers that are 1
IPV_NO only has answers that are 2
I would like to combine them into a single variable named IPV that would have the 1 and 2 results from both individual category.
I have tried using ifelse command but it only shows me the value of IPV_YES.
Dataset I have
My desired outcome
my answer
# Convert blanks to NA (and everything else to numeric), then add up the
# IPV_* columns row by row while keeping ID as the row identifier.
df %>%
  mutate(across(everything(), ~ ifelse(. == "", NA, as.numeric(.)))) %>%
  rowwise(ID) %>%
  transmute(IPV = sum(c_across(everything()), na.rm = TRUE))
# A tibble: 4 x 2
# Rowwise: ID
ID IPV
<dbl> <dbl>
1 1 1
2 2 2
3 3 1
4 4 2
data
df <- data.frame(ID = 1:4, IPV_YES = c(1,"",1,""), IPV_NO = c("",2,"",2))
We can use coalesce after converting the '' to NA
library(dplyr)
df <- df %>%
transmute(ID, IPV = coalesce(na_if(IPV_YES, ""), na_if(IPV_NO, ""))) %>%
type.convert(as.is = TRUE)
data
df <- data.frame(ID = 1:4, IPV_YES = c(1,"",1,""), IPV_NO = c("",2,"",2))
# Take IPV_YES where it is filled in, otherwise the IPV_NO value of the same
# row. Both branches must be full-length vectors: a filtered (shorter) `no`
# vector would be recycled and could pair values with the wrong rows.
df$IPV <- ifelse(df$IPV_YES != "", df$IPV_YES, df$IPV_NO)
Here, we specify an ifelse statement; it can be glossed thus: if the value in df$IPV_YES is not blank, then give the value in df$IPV_YES, else give those values from df$IPV_NO that are not blank.
If you want to remove the IPV_* columns:
df[,2:3] <- NULL
Result:
df
ID IPV
1 1 1
2 2 2
3 3 1
4 4 2
Data:
df <- data.frame(ID = 1:4, IPV_YES = c(1,"",1,""), IPV_NO = c("",2,"",2))
Maybe you can try the code below
replace(df, df == "", NA) %>%
mutate(IPV = coalesce(IPV_YES, IPV_NO)) %>%
select(ID, IPV) %>%
type.convert(as.is = TRUE)
which gives
ID IPV
1 1 1
2 2 2
3 3 1
4 4 2
I have the following dataframe:
df <- data.frame(
ID = c(1,1,1,1,1,1,2,2,2,2,2,2),
group = c("S_1","G_1","G_2","G_3","M_1","M_2","G_1","G_2","S_1","S_2","M_1","M_2"),
CODE = c(0,1,0,0,1,1,0,1,0,0,1,1)
)
ID group CODE
1 1 S_1 0
2 1 G_1 1
3 1 G_2 0
4 1 G_3 0
5 1 M_1 1
6 1 M_2 1
7 2 G_1 0
8 2 G_2 1
9 2 S_1 0
10 2 S_2 0
11 2 M_1 1
12 2 M_2 1
I would like to summarize the CODE column such that for each ID, I end up with one row:
ID CODE
1 1 100,11,0
2 2 01,11,00
for ID==1, I would like to paste G_1,G_2,G_3 without a delimiter (in numeric order). Same goes for M_1 and M_2 and then S_1. Lastly, I would like to add the summarized G, M, and S into one row separating these by a comma (in alphabetic order).
I could potentially remove the numbers and do group_by(group) %>% summarise(CODE=paste(CODE, collapse="")) for the first step. Though I would like the final string to be in alphabetic order.
We can use tidyr::separate to get data in group in different columns based on delimiter (_) and then summarise first by ID and group1 and then by ID to get one string for each ID.
library(dplyr)
# Split "G_1" into a letter part (group1) and a numeric part (group2), order
# numerically within each letter, then collapse twice: first the digits of
# each letter group, then the letter groups of each ID.
df %>%
  tidyr::separate(
    group,
    into = c("group1", "group2"),
    sep = "_",
    convert = TRUE # group2 becomes integer, so G_10 sorts after G_2
  ) %>%
  arrange(ID, group1, group2) %>%
  group_by(ID, group1) %>%
  # Keep the ID grouping explicitly for the second summarise().
  summarise(CODE = paste(CODE, collapse = ""), .groups = "drop_last") %>%
  # Plain comma separator, as in the requested output.
  summarise(CODE = paste(CODE, collapse = ","))
# A tibble: 2 x 2
#     ID CODE
#  <dbl> <chr>
#1     1 100,11,0
#2     2 01,11,00
Without using separate, we can remove everything after "_" and use it as group.
# Sort first, then reduce each group label to its prefix and collapse twice:
# once per ID/prefix pair and once per ID.
df %>%
  arrange(ID, group) %>%
  mutate(group = sub("_.*", "", group)) %>%
  group_by(ID, group) %>%
  # Keep the ID grouping explicitly for the second summarise().
  summarise(CODE = paste(CODE, collapse = ""), .groups = "drop_last") %>%
  # Plain comma separator, as in the requested output.
  summarise(CODE = paste(CODE, collapse = ","))
Base R solution:
# Sort the rows by ID and group, then cut each group label down to its prefix
# (everything from the first underscore onwards is removed).
ordered_df <- df[order(df$ID, df$group), ]
ordered_df$group <- gsub("_.*", "", ordered_df$group)
# Collapse CODE once per ID/prefix pair, stack the pieces, and collapse again
# per ID with a comma between the pieces.
aggregate(
  CODE ~ ID,
  do.call(
    "rbind",
    lapply(
      split(ordered_df, paste0(ordered_df$ID, ordered_df$group)),
      function(piece) {
        data.frame(
          ID = unique(piece$ID),
          CODE = paste0(piece$CODE, collapse = "")
        )
      }
    )
  ),
  paste,
  collapse = ","
)
I have a question:
I have a dataset like this simple example:
df<-data.frame(ID=c("A","B","C","D"),
Score=c("15","16/18/19+2/6","3/+2","19/18/14"))
I want to end up with a dataset in which the score numbers are split. I have a problem with the "/+2" part: when it says "3/+2", it actually means "3/3+2", which would finally give "3/5". So what I would like some help with is ending up with a dataset like this:
ID Score
A 15
B 16/18/19/21/6
C 3/5
D 19/18/14
I already found out that I can then separate the score by
# Split the Score string (not the ID) on "/" and give each piece its own row.
df <- df %>%
  mutate(Score = strsplit(as.character(Score), "/")) %>%
  unnest(Score)
But I don't know how I can let the numbers duplicate and then sum when /+ occurs, could someone help me?
It could be probably solved in a more elegant way, but here is one possibility:
# Split Score on "/", evaluate each piece ("19+2" -> 21) and glue the pieces
# of each ID back together.
df %>%
  mutate(Score = strsplit(as.character(Score), "/")) %>%
  # Name the list column explicitly; unnest() without `cols` is deprecated.
  unnest(Score) %>%
  rowwise() %>%
  # Add up the "+"-separated numbers directly instead of eval(parse()),
  # which would execute whatever text happens to be in the data.
  mutate(Score = sum(as.numeric(strsplit(Score, "+", fixed = TRUE)[[1]]))) %>%
  group_by(ID) %>%
  mutate(Score = paste0(Score, collapse = "/")) %>%
  distinct()
ID Score
<fct> <chr>
1 A 15
2 B 16/18/21/6
3 C 3/5
4 D 19/18/14
Sample data:
df <- data.frame(ID=c("A","B","C","D"),
Score=c("15","16/18/19+2/6","3/3+2","19/18/14"))
It splits "Score" based on /, converts characters to expression by parse() and then transforms it back.
Using the data you provided and the pattern from @A. Suliman:
# Expand "19+2" and "3/+2" into "19/19+2" and "3/3+2" (the "/*" allows zero or
# more slashes before the "+"), split on "/", evaluate each piece and glue the
# pieces of each ID back together.
df %>%
  mutate(
    Score = strsplit(gsub("(\\d+)/*\\+(\\d+)", "\\1/\\1+\\2", Score), "/")
  ) %>%
  # Name the list column explicitly; unnest() without `cols` is deprecated.
  unnest(Score) %>%
  rowwise() %>%
  # Add up the "+"-separated numbers directly instead of eval(parse()),
  # which would execute whatever text happens to be in the data.
  mutate(Score = sum(as.numeric(strsplit(Score, "+", fixed = TRUE)[[1]]))) %>%
  group_by(ID) %>%
  mutate(Score = paste0(Score, collapse = "/")) %>%
  distinct()
ID Score
<fct> <chr>
1 A 15
2 B 16/18/19/21/6
3 C 3/5
4 D 19/18/14
We can use gsubfn to do this in a compact way
library(gsubfn)
library(tidyverse)
# Replace every "a+b" fragment inside Score with its sum. The numbers are
# added directly rather than through eval(parse()), so text from the data is
# never executed as code.
df %>%
  mutate(Score = gsubfn(
    "\\d+\\+\\d+",
    ~ sum(as.numeric(strsplit(x, "+", fixed = TRUE)[[1]])),
    Score
  ))
# ID Score
#1 A 15
#2 B 16/18/21/6
#3 C 3/5
#4 D 19/18/14
data
df <- data.frame(ID=c("A","B","C","D"),
Score=c("15","16/18/19+2/6","3/3+2","19/18/14"), stringsAsFactors = FALSE)
library(dplyr)
library(tidyr) #separate_rows, no need for unnest
# Per row: expand "19+2" / "3/+2" into "19/19+2" / "3/3+2", split on "/",
# sum each piece, and glue the results together before spreading them over
# rows again.
df %>%
  rowwise() %>%
  mutate(Score_upd = paste0(
    # vapply() guarantees one number per piece; the pieces are summed
    # directly instead of being run through eval(parse()).
    vapply(
      unlist(strsplit(gsub("(\\d+)/*\\+(\\d+)", "\\1/\\1+\\2", Score), "/")),
      function(x) sum(as.numeric(strsplit(x, "+", fixed = TRUE)[[1]])),
      numeric(1)
    ),
    collapse = "/"
  )) %>%
  separate_rows(Score_upd, sep = "/")
#short version
# Expand the "+" shorthand, put every piece on its own row, sum each piece,
# and collapse back to one "/"-separated string per ID.
df %>%
  mutate(Score = gsub("(\\d+)/*\\+(\\d+)", "\\1/\\1+\\2", Score)) %>%
  separate_rows(Score, sep = "/") %>%
  rowwise() %>%
  # Sum the "+"-separated numbers directly instead of eval(parse()).
  mutate(Score = sum(as.numeric(strsplit(Score, "+", fixed = TRUE)[[1]]))) %>%
  group_by(ID) %>%
  summarise(Score = paste0(Score, collapse = "/"))
# A tibble: 4 x 2
ID Score
<fct> <chr>
1 A 15
2 B 16/18/19/21/6
3 C 3/5
4 D 19/18/14
The main idea is using gsub to separate 2+3 correctly, e.g:
gsub('(\\d+)/*\\+(\\d+)','\\1/\\1+\\2','20/8/2+3') # "/*" matches zero or more "/" characters, so both 19+2 and 3/+2 are covered.
[1] "20/8/2/2+3"
Then
# Expand the "+" shorthand first, then evaluate every "/"-separated term.
valid_str <- gsub("(\\d+)/*\\+(\\d+)", "\\1/\\1+\\2", "20/8/2+3")
sapply(
  strsplit(valid_str, "/")[[1]],
  function(term) eval(parse(text = term))
)
20 8 2 2+3
20 8 2 5
#OR
sapply(unlist(strsplit(valid_str,'/')),function(x) sum(as.numeric(unlist(strsplit(x,'\\+')))))
20 8 2 2+3
20 8 2 5