I have a large dataset with 3 columns: Name, Country, and Sales.
I'd like to sum the Sales column by Names that are both identical and occur consecutively. Then I'd like to remove all rows but the first occurrence of a series, replacing the value of Sales with the series sum.
For example:
Name,Country,Sales
A,V,100
A,W,100
B,X,100
B,Y,100
A,Z,100
Would be reduced to:
Name,Country,Sales
A,V,200
B,X,200
A,Z,100
Anyone got any idea how to do this?
Your data
df <- structure(list(Name = c("A", "A", "B"), Country = c("X", "Y",
"Z"), Sales = c(100L, 100L, 100L)), .Names = c("Name", "Country",
"Sales"), row.names = c(NA, -3L), class = c("data.table", "data.frame"
))
dplyr solution
library(dplyr)
library(data.table)
ans <- df %>%
group_by(rleid(Name)) %>%
summarise(Name = unique(Name), Sales=sum(Sales)) %>%
select(-1)
Output
Name Sales
<chr> <int>
1 A 200
2 B 100
Alternative example
newdf <- rbind(df, data.frame(Name=c("A","A","B","B"),
Country=c("A","B","C","D"),
Sales=c(100,100,100,100)))
ans <- newdf %>%
group_by(rleid(Name)) %>%
summarise(Name = unique(Name), Sales=sum(Sales)) %>%
select(-1)
Output
Name Sales
<fctr> <dbl>
1 A 200
2 B 100
3 A 200
4 B 200
Here's another solution using sqldf:
library(data.table)
df <- fread("Name,Country,Sales
A,V,100
A,W,100
B,X,100
B,Y,100
A,Z,100")
df$rle = rleid(df$Name)
library(sqldf)
sqldf("select min(rowid) as row_names,
Name,
Country,
sum(Sales) as Sales
from df group by rle", row.names = TRUE)
# Name Country Sales
# 1 A V 200
# 3 B X 200
# 5 A Z 100
row.names = TRUE searches for a column named row_names and treats it as row names, so min(rowid) will not show up as a new column if I set it as row_names.
Try this:
require(dplyr)
df %>%
group_by(Series=rleid(Name)) %>%
mutate(Sales = sum(Sales)) %>%
filter(1:n() == 1)
Output:
Name Country Sales Series
1 A V 200 1
2 B X 200 2
3 A Z 100 3
Sample data:
require(data.table)
df <- fread("Name,Country,Sales
A,V,100
A,W,100
B,X,100
B,Y,100
A,Z,100")
Related
I have a dataset with dead bird records from field observers.
Death.Date Observer Species Bird.ID
1 03/08/2021 DA MF FC10682
2 15/08/2021 AG MF FC10698
3 12/01/2022 DA MF FC20957
4 09/02/2022 DA MF FC10708
I want to produce a dataset from this with the number of unique Bird.ID / Month so I can produce a graph from that. ("unique" because some people make mistakes and enter a bird twice sometimes).
The output in this case would be:
Month Number of dead
08/2021 2
01/2022 1
02/2022 1
The idea is to use the distinct function but by month (knowing the value is in date format dd/mm/yyyy).
In case your Date column is character type first transform to date type with dmy
Change format to month and year
group_by and summarize
library(dplyr)
library(lubridate) # in case your Date is in character format
df %>%
mutate(Death.Date = dmy(Death.Date)) %>% # you may not need this line
mutate(Month = format(as.Date(Death.Date), "%m/%Y")) %>%
group_by(Month) %>%
summarise(`Number of dead`=n())
Month `Number of dead`
<chr> <int>
1 01/2022 1
2 02/2022 1
3 08/2021 2
For completeness, this can be achieved using aggregate without any additional packages:
df <- data.frame(
Death.Date = c("3/8/2021", "15/08/2021", "12/1/2022", "9/2/2022"),
Observer = c("DA", "AG", "DA", "DA"),
Species = c("MF", "MF", "MF", "MF"),
Bird.ID = c("FC10682", "FC10698", "FC20957", "FC10708")
)
aggregate.data.frame(
x = df["Bird.ID"],
by = list(death_month = format(as.Date(df$Death.Date, "%d/%m/%Y"), "%m/%Y")),
FUN = function(x) {length(unique(x))}
)
Notes
The anonymous function function(x) {length(unique(x)) provides the count of the unique values
format(as.Date(df$Death.Date, "%d/%m/%Y"), "%m/%Y")) call ensures that the month/Year string is provided
data.table solution
library(data.table)
library(lubridate)
# Reproductible example with a duplicated bird
deadbirds <- data.table::data.table(Death.Date = c("03/08/2021", "15/08/2021", "12/01/2022", "09/02/2022", "03/08/2021"),
Observer = c("DA", "AG", "DA", "DA", "DA"),
Species = c("MF", "MF", "MF" , "MF", "MF"),
Bird.ID = c("FC10682", "FC10698", "FC20957", "FC10708", "FC10682"))
# Clean dataset = option 1 : delete all duplicated row
deadbirds <- base::unique(deadbirds)
# Clean dataset = option 2 : keep only the first line by bird (can be useful when there is duplicated data with differents values in useless columns)
deadbirds <- deadbirds[
j = .SD[1],
by = c("Bird.ID")
]
# Death.Date as date
deadbirds <- deadbirds[
j = Death.Date := lubridate::dmy(Death.Date)
]
# Create month.Death.Date
deadbirds <- deadbirds[
j = month.Death.Date := base::paste0(lubridate::month(Death.Date),
"/",
lubridate::year(Death.Date))
]
# Count by month
deadbirds <- deadbirds[
j = `Number of dead` := .N,
by = month.Death.Date]
A possible solution, based on tidyverse, lubridate and zoo::as.yearmon:
library(tidyverse)
library(lubridate)
library(zoo)
df <- data.frame(
Death.Date = c("3/8/2021", "15/08/2021", "12/1/2022", "9/2/2022"),
Observer = c("DA", "AG", "DA", "DA"),
Species = c("MF", "MF", "MF", "MF"),
Bird.ID = c("FC10682", "FC10698", "FC20957", "FC10708")
)
df %>%
group_by(date = as.yearmon(dmy(Death.Date))) %>%
summarise(nDead = n_distinct(Bird.ID), .groups = "drop")
#> # A tibble: 3 x 2
#> date nDead
#> <yearmon> <int>
#> 1 Aug 2021 2
#> 2 Jan 2022 1
#> 3 Feb 2022 1
You could use:
as.data.frame(table(format(as.Date(df$Death.Date,'%d/%m/%Y'), '%m/%Y')))
# Var1 Freq
# 1 01/2022 1
# 2 02/2022 1
# 3 08/2021 2
data:
df <- data.frame(
Death.Date = c("3/8/2021", "15/08/2021", "12/1/2022", "9/2/2022"),
Observer = c("DA", "AG", "DA", "DA"),
Species = c("MF", "MF", "MF", "MF"),
Bird.ID = c("FC10682", "FC10698", "FC20957", "FC10708")
)
This question is slightly modified from this one.
I have a dataframe in long table format like this:
df1 <- data.frame(ID=c(1,1,1,1,1,1,2,2),
name=c("a","c","a","c","a","c","a","c"),
value=c("broad",50,"mangrove",50,"mangrove",50,"coniferous",50))
ID name value
1 a broad
1 c 50
1 a mangrove
1 c 50
1 a mangrove
1 c 50
2 a coniferous
2 c 50
About the data: The value from the second row 50 corresponds to the value broad from the first row. Similarly, the value from the fourth row 50 corresponds to the value mangrove from the third row and so on.. In simple words, values for name c are related with name a.
I want to combine the value in such a way that I could get the corresponding values for each name, which would also aggregate the values with similar names:
df2 <- data.frame(ID=c(1,1,2),
name=c("c_broad","c_mangrove","c_coniferous"),
value=c(50,100,50))
which should look like this:
ID name value
1 c_broad 50
1 c_mangrove 100
2 c_coniferous 50
Using reshape2:
library(reshape2)
df1$grp = cumsum(df1$name == "a")
df2 = dcast(df1, ID + grp ~ name)
df2$c = as.numeric(df2$c)
aggregate(c ~ ID + a, df2, sum)
ID a c
1 1 broad 50
2 2 coniferous 50
3 1 mangrove 100
Column names can be changed if desired, also "c_" can be added to the names with paste.
Using tidyverse:
value_a <- df1 %>% dplyr::filter(name=="a") %>% dplyr::pull(value)
df1 %>%
dplyr::filter(name=="c") %>% #Modify into a sensible data frame from here
dplyr::mutate(a = value_a,
name = stringr::str_c(name, "_" ,a)) %>%
dplyr::select(-a) %>% # to here
dplyr::group_by(ID, name) %>%
dplyr::summarise(value=sum(as.numeric(value)))
# A tibble: 3 x 3
# Groups: ID [2]
ID name value
<dbl> <chr> <dbl>
1 1 c_broad 50
2 1 c_mangrove 100
3 2 c_coniferous 50
Tha main problem you find in your dataframe is that a single column is containing, names and values, and that is the first thing you should fix. My advice is always modify the original dataframe into a tidy format (https://tidyr.tidyverse.org/articles/tidy-data.html) and from there leverage all tidyverse power, or data.table or your framework of choice.
Notice the temporal variable value_a could be included in the pipeline directly I have not done it for clarity. The main idea is to separate values and species in different columns, the first three calls in the pipeline, and then apply the usual tidyverse operations.
Might not be the most elegant, but it works:
df1 <- data.frame(ID=c(1,1,1,1,1,1,2,2),
name=c("a","c","a","c","a","c","a","c"),
value=c("broad",50,"mangrove",50,"mangrove",50,"coniferous",50)
)
df1 %>% group_by( 1+floor((1:n()-1)/2) ) %>%
summarize(
ID = ID[1],
name = paste0( name[2], "_", value[1] ),
value = as.numeric(value[2])
) %>% ungroup %>% select( -1 ) %>% group_by(name) %>%
mutate( value = sum(value) ) %>%
unique
Here is somthing improved, that actually is humanly readable:
i <- seq( 1, nrow(df1), 2 )
df1 %>% summarise(
ID = ID[i],
name = paste0( name[i+1], "_", value[i] ),
value = as.numeric(value[i+1])
) %>% group_by(name) %>%
summarize(
ID=ID[1], value = sum( value )
) %>% arrange(ID)
Base R solution:
# Nullify numeric values belonging to a grouping category: grps => character vector
grps <- gsub("\\d+", NA, df1$value)
# Interpolate NA values using prior string value: a => character vector
df1$a <- na.omit(grps)[cumsum(!(is.na(grps)))]
# Split-Apply-Combine aggregation: data.frame => stdout(console)
data.frame(do.call(rbind, lapply(with(df1, split(df1, a)), function(x){
y <- transform(subset(x, !grepl("\\D+", value)), value = as.numeric(value))
setNames(
aggregate(value ~ ID + a, y, FUN = function(z){sum(z, na.rm = TRUE)}),
c("ID", "a", "c")
)
}
)
),
row.names = NULL
)
additional option
df1 <- data.frame(ID=c(1,1,1,1,1,1,2,2),
name=c("a","c","a","c","a","c","a","c"),
value=c("broad",50,"mangrove",50,"mangrove",50,"coniferous",50))
library(tidyverse)
df1 %>%
pivot_wider(ID, names_from = name, values_from = value) %>%
unnest(c("a", "c")) %>%
group_by(ID, name = a) %>%
summarise(value = sum(as.numeric(c), na.rm = T), .groups = "drop")
#> # A tibble: 3 x 3
#> ID name value
#> <dbl> <chr> <dbl>
#> 1 1 broad 50
#> 2 1 mangrove 100
#> 3 2 coniferous 50
Created on 2021-04-12 by the reprex package (v2.0.0)
Having two dataframes with dates like this:
df1 <- data.frame(id = c(1,1,1,1,2,2), date=c("2019/12/11 20:30:12", "2019/12/12 09:20:12", "2019/12/12 11:30:40", "2019/12/13 20:12:34", "2019/12/11 12:20:12", "2019/12/11 19:20:12"), values = c(23,4,1,3,4,2))
df2 <- data.frame(id = c(1,2), date = c("2019/12/12 09:20:12", "2019/12/11 19:20:12"))
How is it possible to use the values of dates of the second dataframe to keep rows before this date into the first dataframe?
Example of expected output:
data.frame(id = c(1,1,2,2), date=c("2019/12/11 20:30:12", "2019/12/12 09:20:12", "2019/12/11 12:20:12, "2019/12/11 19:20:12"), values = c(23,4,4,2))
We can do a left_join and then filter
library(dplyr)
library(lubridate)
left_join(df1, df2, by = 'id') %>%
filter(ymd_hms(date.x) <= ymd_hms(date.y)) %>%
select(id, date = date.x, values)
#id date values
#1 1 2019/12/11 20:30:12 23
#2 1 2019/12/12 09:20:12 4
#3 2 2019/12/11 12:20:12 4
#4 2 2019/12/11 19:20:12 2
so in a dataset, I have a column named "Interventions", and each row looks like this:
row1: "Drug: Rituximab|Drug: Utomilumab|Drug: Avelumab|Drug: PF04518600"
row2: "Biological: alemtuzumab|Biological: donor lymphocytes|Drug: carmustine|Drug: cytarabine|Drug: etoposide|Drug: melphalan|Procedure: allogeneic bone marroow"
I want to only extract the Intervention type such as "Drug", "Biological", "Procedure" to remain in the column. And even better, if can only have the unique Intervention type instead of "Drug" 4 times like the first row.
The expected output would look like this:
row1: "Drug"
row2: "Biological, Drug, Procedure"
I am just getting started with r, I have tidyverse installed and kinda used to playing with the %>%. If anyone can help me with this, much appreciated !
If we want to extract only the prefix part before the :
library(dplyr)
library(stringr)
library(tidyr)
library(purrr)
df1 %>%
mutate(Interventions = map_chr(str_extract_all(Interventions,
"\\w+(?=:)"), ~ toString(sort(unique(.x)))))
# Interventions
#1 Drug
#2 Biological, Drug, Procedure
Or another option is to separate the rows based on the delimiters, slice the alternate rows and paste together the sorted unique values in 'Interventions'
df1 %>%
mutate(rn = row_number()) %>%
separate_rows(Interventions, sep="[:|]") %>%
group_by(rn) %>%
slice(seq(1, n(), by = 2)) %>%
distinct() %>%
summarise(Interventions = toString(sort(unique(Interventions)))) %>%
ungroup %>%
select(-rn)
# A tibble: 2 x 1
# Interventions
# <chr>
#1 Drug
#2 Biological, Drug, Procedure
data
df1 <- structure(list(Interventions = c("Drug: Rituximab|Drug: Utomilumab|Drug: Avelumab|Drug: PF04518600",
"Biological: alemtuzumab|Biological: donor lymphocytes|Drug: carmustine|Drug: cytarabine|Drug: etoposide|Drug: melphalan|Procedure: allogeneic bone marroow"
)), class = "data.frame", row.names = c(NA, -2L))
Not as concise and the same logic as Akruns but in Base R:
# Create df:
df1 <- structure(list(Interventions = c("Drug: Rituximab|Drug: Utomilumab|Drug: Avelumab|Drug: PF04518600",
"Biological: alemtuzumab|Biological: donor lymphocytes|Drug: carmustine|Drug: cytarabine|Drug: etoposide|Drug: melphalan|Procedure: allogeneic bone marroow"
)), class = "data.frame", row.names = c(NA, -2L))
# Assign a row id vec:
df1$row_num <- 1:nrow(df1)
# Split string on | delim:
split_up <- strsplit(df1$Interventions, split = "[|]")
# Roll down the dataframe - keep uniques:
rolled_out <- unique(data.frame(row_num = rep(df1$row_num, sapply(split_up, length)),
Interventions = gsub("[:].*","", unlist(split_up))))
# Stack the dataframe:
df2 <- aggregate(Interventions~row_num, rolled_out, paste0, collapse = ", ")
# Drop id vec:
df2 <- within(df2, rm("row_num"))
This is pretty tricky. Let's say I have, for example, a first dataset df:
sample id name
1 ID200,ID300,ID299 first
2 ID2,ID123 second
3 ID90 third
And a second dataset df_1:
ids condition
ID200 y
ID300 n
ID299 n
ID2 y
ID123 y
ID90 n
I have to filter, from the first dataset, all the rows in which all ID values satisfy a condition on the second table, like y.
So the filtering in this example should give:
sample id name
2 ID2,ID123 second
I was thinking to use something like:
new_df = df %>%
filter(grepl('ID', id), df_1$condition == 'y')
But obviously I need something different, can you give me some clues?
Edit: As I said in the comment, what happens if I have df's id column populated with other text, like this?
sample id name
1 ID = ID200,ID300,ID299,abcd first
2 ID = ID2,ID123, dfg second
3 ID = ID90, text third
Perhaps a bit inelegant, but this would give you the final condition status of each sample.
library(tidyverse)
df <- tibble(sample = c(1, 2, 3),
id = c("ID200,ID300,ID299", "ID2,ID123", "ID90"),
name = c("first", "second", "third"))
df_1 <- tibble(ids = c("ID200", "ID300", "ID299", "ID2", "ID123", "ID90"),
condition = c("y", "n", "n", "y", "y", "n"))
df2 <- df %>%
mutate(ids = str_split(id, ",")) %>%
unnest() %>%
inner_join(df_1, by = "ids") %>%
group_by(sample) %>%
summarise(condition = min(condition))
You could then join that to the to the original data frame for filtering.
filtered <- inner_join(df, df2, by = "sample") %>%
filter(condition == "y")
I'd start by tidying df so as id contains one observation per row:
library(tidyr)
library(dplyr)
df %>%
separate_rows(id)
sample id name
1 1 ID200 first
2 1 ID300 first
3 1 ID299 first
4 2 ID2 second
5 2 ID123 second
6 3 ID90 third
The same operation, followed by a join with df_1:
df %>%
separate_rows(id) %>%
left_join(df_1, by = c("id" = "ids"))
sample id name condition
1 1 ID200 first y
2 1 ID300 first n
3 1 ID299 first n
4 2 ID2 second y
5 2 ID123 second y
6 3 ID90 third n
Now you can group on sample and filter for cases where the only condition is "y":
new_df <- df %>%
separate_rows(id) %>%
left_join(df_1, by = c("id" = "ids")) %>%
group_by(sample) %>%
filter(condition == "y",
n_distinct(condition) == 1) %>%
ungroup()
Result:
sample id name condition
<int> <chr> <chr> <chr>
1 2 ID2 second y
2 2 ID123 second y
If you really want to transform back to the original format with comma-separated ids in a column:
library(purrr)
new_df %>%
nest(id) %>%
mutate(newid = map_chr(data, ~paste(.$id, collapse = ","))) %>%
select(sample, id = newid, name)
sample id name
<int> <chr> <chr>
1 2 ID2,ID123 second