Related
I want to group by district summing 'incoming' values at quarter and get the value of the 'stock' in the last quarter (3) in just one step. 'stock' can not summed through quarters.
My example dataframe:
library(dplyr)
df <- data.frame ("district"= rep(c("ARA", "BJI", "CMC"), each=3),
"quarter"=rep(1:3,3),
"incoming"= c(4044, 2992, 2556, 1639, 9547, 1191,2038,1942,225),
"stock"= c(19547,3160, 1533,5355,6146,355,5816,1119,333)
)
df
district quarter incoming stock
1 ARA 1 4044 19547
2 ARA 2 2992 3160
3 ARA 3 2556 1533
4 BJI 1 1639 5355
5 BJI 2 9547 6146
6 BJI 3 1191 355
7 CMC 1 2038 5816
8 CMC 2 1942 1119
9 CMC 3 225 333
The actual dataframe has ~45.000 rows and 41 variables of which 8 are of type stock.
The result should be:
# A tibble: 3 × 3
district stock incoming
<chr> <dbl> <dbl>
1 ARA 1533 9592
2 BJI 355 12377
3 CMC 333 4205
I know how to get to the result but in three steps and I don't think it's efficient and error prone due to the data.
My approach:
basea <- df %>%
group_by(district) %>%
filter(quarter==3) %>% #take only the last quarter
summarise(across(stock, sum)) %>%
baseb <- df %>%
group_by(district) %>%
summarise(across(incoming, sum)) %>%
final <- full_join(basea, baseb)
Does anyone have any suggestions to perform the procedure in one (or at least two) steps?
Grateful,
Modus
Given that the dataset only has 3 quarters and not 4. If that's not the case use nth(3) instead of last()
library(tidyverse)
df %>%
group_by(district) %>%
summarise(stock = last(stock),
incoming = sum(incoming))
# A tibble: 3 × 3
district stock incoming
<chr> <dbl> <dbl>
1 ARA 1533 9592
2 BJI 355 12377
3 CMC 333 4205
here is a data.table approach
library(data.table)
setDT(df)[, .(incoming = sum(incoming), stock = stock[.N]), by = .(district)]
district incoming stock
1: ARA 9592 1533
2: BJI 12377 355
3: CMC 4205 333
Here's a refactor that removes some of the duplicated code. This also seems like a prime use-case for creating a custom function that can be QC'd and maintained easier:
library(dplyr)
df <- data.frame ("district"= rep(c("ARA", "BJI", "CMC"), each=3),
"quarter"=rep(1:3,3),
"incoming"= c(4044, 2992, 2556, 1639, 9547, 1191,2038,1942,225),
"stock"= c(19547,3160, 1533,5355,6146,355,5816,1119,333)
)
aggregate_stocks <- function(df, n_quarter) {
base <- df %>%
group_by(district)
basea <- base %>%
filter(quarter == n_quarter) %>%
summarise(across(stock, sum))
baseb <- base %>%
summarise(across(incoming, sum))
final <- full_join(basea, baseb, by = "district")
return(final)
}
aggregate_stocks(df, 3)
#> # A tibble: 3 × 3
#> district stock incoming
#> <chr> <dbl> <dbl>
#> 1 ARA 1533 9592
#> 2 BJI 355 12377
#> 3 CMC 333 4205
Here is the same solution as #Tom Hoel but without using a function to subset, instead just use []:
library(dplyr)
df %>%
group_by(district) %>%
summarise(stock = stock[3],
incoming = sum(incoming))
district stock incoming
<chr> <dbl> <dbl>
1 ARA 1533 9592
2 BJI 355 12377
3 CMC 333 4205
I Have a big data base of below sample data . I am trying to create a function which can filter Dept from Dept column one by one and the from filtered subset data create a table for all filtered managers like below.
Managers can be many or less for dept , so that it can create table dynamically for filtered managers. because sometimes managers can be 1-23 or more for dept.
I tried with the help filtering and transposing data and then binding it but that is not working properly for me as i am medium proficient in dplyr only.
df <- data.frame(Dept = c("CA","HR","CA","HR","HR","CA","HR","HR","CA","CA","HR","CA","CA"),
Manager = c("AKASH","MANU","AMAN","SANU","NISH","KAMAL","VEER","SANIL","SAMEER","KANU","NUKUL","KUNAL","RAMIT"),
PF = c("Yes","No","Yes","Yes","Yes","No","No","Yes","No","Yes","Yes","Yes","No"),
Yearlybonus=c(6946,5871,0,7173,2161,3008,0,3025,4323,4196,0,5594,2313),
Quaterlybonus=c(2683,3846,0,2391,6716,6012,5479,3869,3764,0,4632,0,2371),
monthlybonus=c(4453,6466,2811,6845,4377,2617,0,7631,7761,2944,6270,3534,5856))
using data.table for large dataset
This solution does not accumulate transformed data as list. Instead, it uses a function for filtering data
library('data.table')
setDT(df)
df[, `:=`(Yearlybonus = as.character(Yearlybonus),
Quaterlybonus = as.character(Quaterlybonus),
monthlybonus = as.character(monthlybonus))]
This function is a pure function, because it sends a copy of data (df) to function everytime. If you have less memory space, you could refactor this function to use the data (df) from global scope. The refactored code will look like the ones in commented lines.
# myfun <- function(x) {
myfun <- function(df, x) {
# wide to long
# y <- melt( df[Dept == x],
df <- melt( df[Dept == x],
id.vars = c('Manager'),
measure.vars = c('PF', 'Yearlybonus', 'Quaterlybonus', 'monthlybonus'),
variable.name = 'T' )
# long to wide
# y <- dcast(y, T ~ Manager, value.var = 'value')
df <- dcast(df, T ~ Manager, value.var = 'value')
# add dept column
#y[, Dept := x ]
df[, Dept := x ]
# set column order in memory
#nm <- names(y)
nm <- names(df)
nm <- c('Dept', nm[nm != 'Dept'])
#setcolorder(y, nm)
setcolorder(df, nm)
#return(y[])
return(df[]) }
}
# create index for speed
setkey(df, Dept)
# myfun(x = 'CA')
myfun(df = df, x = 'CA')
# Dept T AKASH AMAN KAMAL KANU KUNAL RAMIT SAMEER
#1: CA PF Yes Yes No Yes Yes No No
#2: CA Yearlybonus 6946 0 3008 4196 5594 2313 4323
#3: CA Quaterlybonus 2683 0 6012 0 0 2371 3764
#4: CA monthlybonus 4453 2811 2617 2944 3534 5856 7761
# myfun(x = 'HR')
myfun(df = df, x = 'HR')
# Dept T MANU NISH NUKUL SANIL SANU VEER
#1: HR PF No Yes Yes Yes Yes No
#2: HR Yearlybonus 5871 2161 0 3025 7173 0
#3: HR Quaterlybonus 3846 6716 4632 3869 2391 5479
#4: HR monthlybonus 6466 4377 6270 7631 6845 0
How about this:
library(tidyr)
library(dplyr)
library(purrr)
map(unique(df$Dept), ~df %>% filter(Dept == .x) %>%
mutate(across(Yearlybonus:monthlybonus, ~as.character(.x))) %>%
pivot_longer(PF:monthlybonus, names_to="T", values_to="vals") %>%
pivot_wider(names_from="Manager", values_from="vals"))
# [[1]]
# # A tibble: 4 x 9
# Dept T AKASH AMAN KAMAL SAMEER KANU KUNAL RAMIT
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 CA PF Yes Yes No No Yes Yes No
# 2 CA Yearlybonus 6946 0 3008 4323 4196 5594 2313
# 3 CA Quaterlybonus 2683 0 6012 3764 0 0 2371
# 4 CA monthlybonus 4453 2811 2617 7761 2944 3534 5856
#
# [[2]]
# # A tibble: 4 x 8
# Dept T MANU SANU NISH VEER SANIL NUKUL
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 HR PF No Yes Yes No Yes Yes
# 2 HR Yearlybonus 5871 7173 2161 0 3025 0
# 3 HR Quaterlybonus 3846 2391 6716 5479 3869 4632
# 4 HR monthlybonus 6466 6845 4377 0 7631 6270
Here's a brief look at my data
X name sex X1880 X1881
1 1 Mary F 7065 6919
2 2 Anna F 2604 2698
3 3 Emma F 2003 2034
4 4 Elizabeth F 1939 1852
5 5 Minnie F 1746 1653
Each "X----" represents a year (up to 2010), the column "name" represents a unique name for a child, and so the corresponding number between any name and year is the number of children born in year "X---" with the specified name (for example, there were 7065 Marys born in 1880).
I would like to loop through columns covering the years 1931 to 2010, find the total number of children born in that year, and then find the total number of children born in that year whose name begins with each letter of the alphabet. Finally, I would like to get the percent of children born in each year whose name begins with each letter, and store it to a list so I can plot trend lines for all letters/all years on the same graph.
Here is the code I have
allnames <- read.csv("SSA-longtail-names.csv")
girls <- subset(allnames, allnames$sex=="F")
year_columns <- as.vector(names(girls)[54:134])
percs <- list()
years <- length(year_columns)
letters <- length(LETTERS)
for (i in range(1:years)){
total = sum(girls[year_columns[i]])
for (n in range(1:letters)){
l <- toString(LETTERS[n])
sub <- girls[(grep(l, girls$name)),year_columns[i]]
sub_total <- sum(sub[year_columns[i]])
percent <- (sub_total / total) * 100
percs <- append(percs, percent)
}
}
But the for loops only go through 8 iterations, and the list percs (which is supposed to store the calculated percentages) is full of NAs. Can anyone suggest a way to fix these loops, or perhaps an even easier way to accomplish this task?
Here is an approach using dplyr, tidyr, and stringr to make a long data table by pivoting your year columns.
library(dplyr)
library(tidyr)
library(stringr)
data2 <- data %>%
pivot_longer(cols = c(-X, -name, -sex), names_to = "year", values_to = "births") %>%
complete.cases() %>% # remove NA rows
mutate(year = as.integer(str_remove(year, "X")),
first_letter = str_sub(name, start = 1, end = 1) %>%
filter(year >= 1931 & year <= 2010)
Now you can do something like:
data3 <- data2 %>%
group_by(first_letter, year) %>%
summarize(total = sum(births))
This gives you a data.frame of three columns:
first_letter year total
A 1880 17972
A 1881 16426
# etc.
Now you can do some plotting, for example with ggplot2
library(ggplot2)
# this only looks at the English vowels to make a manageable example
ggplot(data = data3 %>% filter(first_letter %in% c("A", "E", "I", "O", "U"),
aes(x = year, y = total, color = first_letter)) +
geom_line()
As mentioned, consider reshaping data to long format (the better format in data analytics for merging, cleaning, aggregating, modeling, and plotting).
Reshape
girls_long <- reshape(girls, varying = names(girls)[4:ncol(girls)], times = names(girls)[4:ncol(girls)],
idvar = c("X", "name", "sex"),
v.names = "count", timevar = "year", ids=NULL,
new.row.names = 1:1E5, direction = "long")
girls_long$year <- as.integer(gsub("X", "", girls_long$year))
girls_long
# X name sex year count
# 1 1 Mary FALSE 1880 7065
# 2 2 Anna FALSE 1880 2604
# 3 3 Emma FALSE 1880 2003
# 4 4 Elizabeth FALSE 1880 1939
# 5 5 Minnie FALSE 1880 1746
# 6 1 Mary FALSE 1881 6919
# 7 2 Anna FALSE 1881 2698
# 8 3 Emma FALSE 1881 2034
# 9 4 Elizabeth FALSE 1881 1852
# 10 5 Minnie FALSE 1881 1653
Aggregations
# Total number of children born in that year
total_df <- aggregate(name ~ year, girls_long, FUN=length)
total_df
# year count
# 1 1880 15357
# 2 1881 15156
# Total number of children born in that year whose name begins with each letter of the alphabet
girls_long$name_letter <- substring(girls_long$name, 1, 1)
girls_agg <- aggregate(cbind(count=name) ~ name_letter + year, girls_long, FUN=length)
girls_agg
# name_letter year count
# 1 A 1880 2604
# 2 E 1880 3942
# 3 M 1880 8811
# 4 A 1881 2698
# 5 E 1881 3886
# 6 M 1881 8572
# Percent of children born in each year whose name begins with each letter
girls_agg$percent <- with(girls_agg, count / ave(count, year, FUN=sum))
girls_agg
# name_letter year count percent
# 1 A 1880 2604 0.1695644
# 2 E 1880 3942 0.2566908
# 3 M 1880 8811 0.5737449
# 4 A 1881 2698 0.1780153
# 5 E 1881 3886 0.2564001
# 6 M 1881 8572 0.5655846
I've split the solution into the three parts you describe. If you are only after the percentages, you can ignore the first part (total) and combine the second and third:
library(dplyr)
library(stringr)
library(tidyr)
data <- tibble(name = c('Mary', 'Anna', 'Emma', 'Elizabeth', 'Minnie'),
sex = rep('F', 5),
X1880 = c(7065, 2604, 2003, 1939, 1746),
X1881 = c(6919, 2698, 2034, 1852, 1653))
total <- data %>%
summarise(across(X1880:X1881, sum)) %>%
pivot_longer(everything(), names_to = 'year', values_to = 'total')
total
# year total
# <chr> <dbl>
# 1 X1880 15357
# 2 X1881 15156
totalPerLetter <- data %>%
mutate(letter = str_extract(name, '^.')) %>%
select(letter, starts_with('X')) %>%
pivot_longer(-letter, names_to = 'year', values_to = 'count') %>%
group_by(letter, year) %>%
mutate(count = sum(count)) %>%
distinct()
totalPerLetter
# letter year count
# <chr> <chr> <dbl>
# 1 M X1880 8811
# 2 M X1881 8572
# 3 A X1880 2604
# 4 A X1881 2698
# 5 E X1880 3942
# 6 E X1881 3886
pctPerLetter <- totalPerLetter %>%
group_by(year) %>%
mutate(total = sum(count)) %>%
ungroup() %>%
mutate(percent = count/(total/100))
pctPerLetter
# letter year count total percent
# <chr> <chr> <dbl> <dbl> <dbl>
# 1 M X1880 8811 15357 57.4
# 2 M X1881 8572 15156 56.6
# 3 A X1880 2604 15357 17.0
# 4 A X1881 2698 15156 17.8
# 5 E X1880 3942 15357 25.7
# 6 E X1881 3886 15156 25.6
I checked similar entries in SO, none answers my question exactly.
My problem is this:
Let's say, User1 has 6 purchases, User2 has 2.
Purchase data is something like this:
set.seed(1234)
purchase <- data.frame(id = c(rep("User1", 6), rep("User2", 2)),
purchaseid = sample(seq(1, 100, 1), 8),
purchaseDate = seq(Sys.Date(), Sys.Date() + 7, 1),
price = sample(seq(30, 200, 10), 8))
#
users <- data.frame(id = c("User1","User2"),
uname = c("name1", "name2"),
uaddress = c("add1", "add2"))
> purchase
id purchaseid purchaseDate price
1 User1 12 2019-09-27 140
2 User1 62 2019-09-28 110
3 User1 60 2019-09-29 200
4 User1 61 2019-09-30 190
5 User1 83 2019-10-01 60
6 User1 97 2019-10-02 150
7 User2 1 2019-10-03 160
8 User2 22 2019-10-04 120
End data required includes 1 row for each user, that keeps the user name, address, etc. Then comes next columns for 20 purchases. The purchase data needs to be placed one after another in the same row. This is the rule: only one row for each user. If the user does not have 20 purchases, the remaining fields should be empty.
End data should therefore look like this:
id uname uaddr p1id p1date p1price p2id p2date p2price p3id p3date p3price p4id
1 User1 name1 add1 12 2019-09-27 140 62 2019-09-28 110 60 2019-09-29 200 61
2 User2 name2 add2 1 2019-10-03 160 22 2019-10-04 120 NA <NA> NA NA
p4date p4price
1 2019-09-30 190
2 <NA> NA
enddata <- data.frame(id = c("User1", "User2"),
uname = c("name1", "name2"),
uaddr = c("add1", "add2"),
p1id = c(12,1),
p1date = c("2019-09-27","2019-10-03"),
p1price = c(140, 160),
p2id = c(62, 22),
p2date = c("2019-09-28", "2019-10-04"),
p2price = c(110, 120),
p3id = c(60, NA),
p3date = c("2019-09-29", NA),
p3price = c(200, NA),
p4id = c(61, NA),
p4date = c("2019-09-30", NA),
p4price = c(190, NA))
I used reshape to get the data for each user into the wide format. The idea was doing it in a loop for each user id. Then I used rbindlist with the fill option TRUE, but this time I am having problem with column names. After reshape, each gets different column names. Without fixed number of columns, you cannot set names either.
Any elegant solution to this?
There's no need to process each id separately. Instead we can operate by id within a single data frame. Below is a tidyverse approach. You can stop the chain at any point to see the intermediate output. I've added comments to explain what the code is doing, but let me know if anything is unclear.
library(tidyverse)
dat = users %>%
# Join purchase data to user data
left_join(purchase) %>%
arrange(purchaseDate) %>%
# Create a count column to assign a sequence number to each purchase within each id.
# We'll use this later to create columns for each purchase event with a unique
# sequence number for each purchase.
group_by(id) %>%
mutate(seq=1:n()) %>%
ungroup %>%
# Reshape data frame to from "wide" to "long" format
gather(key, value, purchaseid:price) %>%
arrange(seq) %>%
# Paste together the "key" and "seq" columns (the resulting column will still be
# called "key"). This will allow us to spread the data frame to one row per id
# with each purchase event properly numbered.
unite(key, key, seq, sep="_") %>%
mutate(key = factor(key, levels=unique(key))) %>%
spread(key, value) %>%
# Convert date columns back to Date class
mutate_at(vars(matches("Date")), as.Date, origin="1970-01-01")
dat
id uname uaddress purchaseid_1 purchaseDate_1 price_1 purchaseid_2 purchaseDate_2 price_2
1 User1 name1 add1 12 2019-09-27 140 62 2019-09-28 110
2 User2 name2 add2 1 2019-10-03 160 22 2019-10-04 120
purchaseid_3 purchaseDate_3 price_3 purchaseid_4 purchaseDate_4 price_4 purchaseid_5 purchaseDate_5
1 60 2019-09-29 200 61 2019-09-30 190 83 2019-10-01
2 NA <NA> NA NA <NA> NA NA <NA>
price_5 purchaseid_6 purchaseDate_6 price_6
1 60 97 2019-10-02 150
2 NA NA <NA> NA
Another option using data.table:
#pivot to wide format
setDT(users)
setDT(purchase)[, pno := rowid(id)]
ans <- dcast(purchase[users, on=.(id)], id + uname + uaddress ~ pno,
value.var=c("purchaseid","purchaseDate", "price"))
#reorder columns
nm <- grep("[1-9]$", names(ans), value=TRUE)
setcolorder(ans, c(setdiff(names(ans), nm), nm[order(gsub("(.*)_", "", nm))]))
ans
output:
id uname uaddress purchaseid_1 purchaseDate_1 price_1 purchaseid_2 purchaseDate_2 price_2 purchaseid_3 purchaseDate_3 price_3 purchaseid_4 purchaseDate_4 price_4 purchaseid_5 purchaseDate_5 price_5 purchaseid_6 purchaseDate_6 price_6
1: User1 name1 add1 12 2019-09-30 140 62 2019-10-01 110 60 2019-10-02 200 61 2019-10-03 190 83 2019-10-04 60 97 2019-10-05 150
2: User2 name2 add2 1 2019-10-06 160 22 2019-10-07 120 NA <NA> NA NA <NA> NA NA <NA> NA NA <NA> NA
I've got the following three dataframes:
df1 <- data.frame(name=c("John", "Anne", "Christine", "Andy"),
age=c(31, 26, 54, 48),
height=c(180, 175, 160, 168),
group=c("Student",3,5,"Employer"), stringsAsFactors=FALSE)
df2 <- data.frame(name=c("Anne", "Christine"),
age=c(26, 54),
height=c(175, 160),
group=c(3,5),
group2=c("Teacher",6), stringsAsFactors=FALSE)
df2 <- data.frame(name=c("Christine"),
age=c(54),
height=c(160),
group=c(5),
group2=c(6),
group3=c("Scientist"), stringsAsFactors=FALSE)
I'd like to combine them so that I get the following result:
df.all <- data.frame(name=c("John", "Anne", "Christine", "Andy"),
age=c(31, 26, 54, 48),
height=c(180, 175, 160, 168),
group=c("Student", "Teacher", "Scientist", "Employer"))
At the moment I'm doing it this way:
df.all <- merge(merge(df1[,c(1,4)], df2[,c(1,5)], all=TRUE, by="name"),
df3[,c(1,6)], all=TRUE, by="name")
row.ind <- which(df.all$group %in% c(6,5))
df.all[row.ind, c("group")] <- df.all[row.ind, c("group2")]
row.ind2 <- which(df.all$group2 %in% c(6))
df.all[row.ind2, c("group")] <- df.all[row.ind2, c("group3")]
This isn't generalisable and it is really messy. Maybe there would be a way to use merge_all or merge_recurse for the merging step (especially as there might be more than two dataframes to be merged), but I haven't figured out how. These two don't produce the right result:
df.all <- merge_all(list(df1, df2, df3))
df.all <- merge_recurse(list(df1, df2, df3), by=c("name"))
Is there a more general and elegant way to solve this problem?
Here is another possible approach, if I understand what you're ultimately after. (It is not clear what the numeric values in the "group" columns are, so I'm not sure this is exactly what you're looking for.)
Use Reduce() to merge your multiple data.frames.
temp <- Reduce(function(x, y) merge(x, y, all=TRUE), list(df1, df2, df3))
names(temp)[4] <- "group1" # Rename "group" to "group1" for reshaping
temp
# name age height group1 group2 group3
# 1 Andy 48 168 Employer <NA> <NA>
# 2 Anne 26 175 3 Teacher <NA>
# 3 Christine 54 160 5 6 Scientist
# 4 John 31 180 Student <NA> <NA>
Use reshape() to reshape your data from wide to long.
df.all <- reshape(temp, direction = "long", idvar="name", varying=4:6, sep="")
df.all
# name age height time group
# Andy.1 Andy 48 168 1 Employer
# Anne.1 Anne 26 175 1 3
# Christine.1 Christine 54 160 1 5
# John.1 John 31 180 1 Student
# Andy.2 Andy 48 168 2 <NA>
# Anne.2 Anne 26 175 2 Teacher
# Christine.2 Christine 54 160 2 6
# John.2 John 31 180 2 <NA>
# Andy.3 Andy 48 168 3 <NA>
# Anne.3 Anne 26 175 3 <NA>
# Christine.3 Christine 54 160 3 Scientist
# John.3 John 31 180 3 <NA>
Take advantage of the fact that as.numeric() will coerce characters to NA, and use na.omit() to remove all of the rows with NA values.
na.omit(df.all[is.na(as.numeric(df.all$group)), ])
# name age height time group
# Andy.1 Andy 48 168 1 Employer
# John.1 John 31 180 1 Student
# Anne.2 Anne 26 175 2 Teacher
# Christine.3 Christine 54 160 3 Scientist
Again, this might be over-generalizing your problem--there might be NA values in other columns, for example--but it might help direct you towards a solution to your problem.
First step is to use merge_recurse with all.x = TRUE:
library(reshape)
merge.all <- merge_recurse(list(df1, df2, df3), all.x = TRUE)
# name age height group group2 group3
# 1 Anne 26 175 3 Teacher <NA>
# 2 Christine 54 160 5 6 Scientist
# 3 John 31 180 Student <NA> <NA>
# 4 Andy 48 168 Employer <NA> <NA>
Then you can use apply to get the last non-NA group from all the "group" columns:
group.cols <- grep("group", colnames(merge.all))
merge.all <- data.frame(merge.all[-group.cols],
group = apply(merge.all[group.cols], 1,
function(x)tail(na.omit(x), 1)))
# name age height group
# 1 Anne 26 175 Teacher
# 2 Christine 54 160 Scientist
# 3 John 31 180 Student
# 4 Andy 48 168 Employer