create list by groups and create json [duplicate] - r

This question already has answers here:
combine data in depending on the value of one column
(2 answers)
Closed 7 years ago.
Ok I had this problem that has been solved
combine data in depending on the value of one column
I have been trying to adapt that solution to a more complicated problem, but I have not been able to come up with a solution: instead of 2 columns I have 3.
df <- structure(list(year = c(2000L, 2001L, 2002L, 2003L, 2001L, 2002L), group = c(1L, 1L, 1L, 1L, 2L, 2L), sales = c(20L, 25L, 23L, 30L, 50L, 55L), expenses = c(19L, 19L, 20L, 15L, 27L, 30L)), .Names = c("year", "group", "sales", "expenses"), class = "data.frame", row.names = c(NA, -6L))
year group sales expenses
1 2000 1 20 19
2 2001 1 25 19
3 2002 1 23 20
4 2003 1 30 15
5 2001 2 50 27
6 2002 2 55 30
And I need the same output as in the first problem but instead of just the sales I also need to include the expenses in the json file
[{"group": 1, "sales":[[2000,20],[2001, 25], [2002,23], [2003, 30]], "expenses":[[2000,19],[2001, 19], [2002,20], [2003, 15]]},
{"group": 2, "sales":[[2001, 50], [2002,55]], "expenses":[[2001, 27], [2002,30]]}]

# For each group, format the (year, sales) and (year, expenses) pairs as
# "[[y,v], [y,v], ...]" character strings, then hand the result to toJSON.
# Uses `df`, the data frame defined in the question; the snippet as posted
# referenced `df1`, which is never defined here.
# NOTE(review): toJSON is assumed to come from a JSON package loaded
# elsewhere; confirm which one, since it decides how these pre-formatted
# strings are serialised.
toJSON(setDT(df)[, list(
  sales = paste0('[', toString(sprintf('[%d,%d]', year, sales)), ']'),
  expenses = paste0('[', toString(sprintf('[%d,%d]', year, expenses)), ']')
), by = group])
Try this. It's no different from akrun's answer to "combine data in depending on the value of one column".

Related

slice() on a long skinny list with multiple entries

I have a data frame in R which is comprised like this:
year
region
age
population_count
cumulative_count*
middle_value*
2001
Region x
0
10
10
50
2001
Region x
1
10
20
50
2001
Region x
2
10
30
50
2001
Region x
3
10
40
50
2001
Region x
4
10
50
50
2001
Region x
5
10
60
50
...2020
Region y
1
10
20
50
For each year and region combination I have a discrete cumulative_count (derived from population_count by age) and middle_value (derived from the cumulative_count), again discrete for each year and region combination.
I want to extract from this the row for each region and year combination where the cumulative_count is closest to the middle_value in each instance.
(In the example above this would be age 4 in Region x, where cumulative_count = 50 and middle_value = 50.)
I have tried slice from dplyr:
slice(which.min(abs(table$cumulative_count - table$middle_value)))
but this only returns the first instance of the row where there is a match, not all the subsequent year and region combinations.
group_by(year,region) doesn't return all the possible year and region combinations either.
I feel I should be looping through the data frame for all possible year and region combinations and then slicing out the rows that meet the criteria.
Any thoughts?
UPDATE
I used @Merijn van Tilborg's dplyr approach, as I needed only the first match.
Here's a screenshot of the output table - note that variable column is the single year of age and is getting older.
I suggest using rank, as it ranks from low to high. If you rank on the absolute difference, the rank within each group is by definition 1 for the smallest difference, so you can simply filter on that value. It also lets you choose how ties are handled via ties.method.
include ties
dat %>%
group_by(year, region) %>%
filter(rank(abs(cumulative_count - middle_value), ties.method = "min") == 1)
# # A tibble: 6 x 6
# # Groups: year, region [4]
# year region age population_count cumulative_count middle_value
# <int> <chr> <int> <int> <int> <int>
# 1 2001 Region x 2 10 30 50
# 2 2002 Region x 2 10 30 50
# 3 2001 Region y 2 10 30 50
# 4 2002 Region y 0 10 30 50
# 5 2002 Region y 1 10 30 50
# 6 2002 Region y 2 10 30 50
show first one only
dat %>%
group_by(year, region) %>%
filter(rank(abs(cumulative_count - middle_value), ties.method = "first") == 1)
# # A tibble: 4 x 6
# # Groups: year, region [4]
# year region age population_count cumulative_count middle_value
# <int> <chr> <int> <int> <int> <int>
# 1 2001 Region x 2 10 30 50
# 2 2002 Region x 2 10 30 50
# 3 2001 Region y 2 10 30 50
# 4 2002 Region y 0 10 30 50
other options include: rank(x, na.last = TRUE, ties.method = c("average", "first", "last", "random", "max", "min"))
using data.table instead of dplyr
library(data.table)
setDT(dat) # make dat a data.table
dat[, .SD[rank(abs(cumulative_count - middle_value), ties.method = "min") == 1], by = c("year", "region")]
data
dat <- structure(list(year = c(2001L, 2001L, 2001L, 2002L, 2002L, 2002L,
2001L, 2001L, 2001L, 2002L, 2002L, 2002L), region = c("Region x",
"Region x", "Region x", "Region x", "Region x", "Region x", "Region y",
"Region y", "Region y", "Region y", "Region y", "Region y"),
age = c(0L, 1L, 2L, 0L, 1L, 2L, 0L, 1L, 2L, 0L, 1L, 2L),
population_count = c(10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L), cumulative_count = c(10L, 20L, 30L,
10L, 20L, 30L, 10L, 20L, 30L, 30L, 30L, 30L), middle_value = c(50L,
50L, 50L, 50L, 50L, 50L, 50L, 50L, 50L, 50L, 50L, 50L)), class = "data.frame", row.names = c(NA,
-12L))
You could de-mean the groups and look where the value is zero. You probably will have ties, depends on what you want, you could simply use the first one by subsetting with [1, ].
# Split dat by year/region and, within each group, keep the first row whose
# cumulative_count equals the group's mean cumulative_count; the per-group
# rows are then stacked back into one data frame with rbind.
# NOTE(review): the comparison is against mean(cumulative_count), not
# middle_value, and uses an exact `== 0` test. A group with no row exactly
# at its mean gives an empty subset, so `[1, ]` returns a row of NAs.
by(dat, dat[c('year', 'region')], \(x)
x[x$cumulative_count - mean(x$cumulative_count) == 0, ][1, ]) |>
do.call(what=rbind)
# year region age population_count cumulative_count middle_value
# 2 2001 Region x 1 10 20 50
# 5 2002 Region x 1 10 20 50
# 8 2001 Region y 1 10 20 50
# 10 2002 Region y 0 10 30 50
Note: R >= 4.1 used.
Data:
dat <- structure(list(year = c(2001L, 2001L, 2001L, 2002L, 2002L, 2002L,
2001L, 2001L, 2001L, 2002L, 2002L, 2002L), region = c("Region x",
"Region x", "Region x", "Region x", "Region x", "Region x", "Region y",
"Region y", "Region y", "Region y", "Region y", "Region y"),
age = c(0L, 1L, 2L, 0L, 1L, 2L, 0L, 1L, 2L, 0L, 1L, 2L),
population_count = c(10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L), cumulative_count = c(10L, 20L, 30L,
10L, 20L, 30L, 10L, 20L, 30L, 30L, 30L, 30L), middle_value = c(50L,
50L, 50L, 50L, 50L, 50L, 50L, 50L, 50L, 50L, 50L, 50L)), class = "data.frame", row.names = c(NA,
-12L))

Count an observation based on condition of another variable [duplicate]

This question already has answers here:
Counting unique / distinct values by group in a data frame
(12 answers)
Closed 1 year ago.
I have a dataset of regional patents. I want to count how many Appln_id values have more than one Person_id and how many have only one Person_id.
Appln_id 3 3 3 10 10 10 10 2 4 4
Person_id 23 22 24 49 50 55 51 101 122 104
Here Appln_id 3 has three different Person_id values (23, 22, 24) and Appln_id 2 has only one Person_id (101). So I want to count how many Appln_id values have more than one Person_id and how many have only one.
Count number of unique person for each Appln_id.
library(dplyr)
result <- df %>% group_by(Appln_id) %>% summarise(n = n_distinct(Person_id))
result
# Appln_id n
#* <int> <int>
#1 2 1
#2 3 3
#3 4 2
#4 10 4
Now you can count how many of them have only 1 Person_id and how many of them have more than that.
sum(result$n == 1)
#[1] 1
sum(result$n > 1)
#[1] 3
data
df <- structure(list(Appln_id = c(3L, 3L, 3L, 10L, 10L, 10L, 10L, 2L,
4L, 4L), Person_id = c(23L, 22L, 24L, 49L, 50L, 55L, 51L, 101L,
122L, 104L)), class = "data.frame", row.names = c(NA, -10L))
We can use data.table
library(data.table)
setDT(df)[, .(n = uniqueN(Person_id)), by = Appln_id]

How to take data from one data.frame and put it in another with multiple conditions in R

I am having trouble figuring out how to do something. I have two data.frames, and I want to take variables from one data.frame and add them to the other under certain conditions. Here is an extract of my two data.frames:
Data.frame 1 :
ID YEAR_F
154 2005
432 2005
123 2007
Data.frame 2 :
ID Year_D Month DC1 DC2
154 2001 1 4 23
154 2001 2 56 22
154 2005 1 32 11
154 2005 2 12 10
432 2005 1 23 11
432 2006 1 23 10
432 2006 2 22 11
123 2001 1 12 34
123 2007 1 11 12
123 2007 2 11 11
123 2004 1 43 43
So i want to take the DC1 and DC2 of the second data.frame and add it to my first data.frame. But, i want it to do it according to the year of the first data.frame. Plus, i want to have a column of DC1 and DC2 by month. So, in final my data.frame will look something like that :
data.frame 3 :
ID Year_D DC1_M1 DC1_M2 DC2_M1 DC2_M2
154 2005 32 12 11 10
432 2005 23 na 11 na
123 2007 11 11 12 11
I'm not really sure how to do it ? Especially because the structure of the second data.frame change ?
Thank you in advance!
We can pivot the second dataset to 'wide' format after filtering based on the 'YEAR_F' of first data and then do a join
library(dplyr)
library(tidyr)
# Keep only the df2 rows whose (ID, year) pair appears in df1. Filtering on
# the year alone would also keep rows where an ID happens to have data for
# another ID's YEAR_F, giving duplicate ID/Month cells when widening.
df2 %>%
  semi_join(df1, by = c("ID", "Year_D" = "YEAR_F")) %>%
  select(-Year_D) %>%
  # One column per measure/month combination: DC1_1, DC1_2, DC2_1, DC2_2.
  pivot_wider(names_from = Month, values_from = c(DC1, DC2)) %>%
  # right_join keeps every row of df1; the key is spelled out so the join
  # does not depend on which column names the two tables happen to share.
  right_join(df1, by = "ID") %>%
  # Put df1's columns first, followed by the widened measures.
  select(all_of(names(df1)), everything())
-output
# A tibble: 3 x 6
# ID YEAR_F DC1_1 DC1_2 DC2_1 DC2_2
# <int> <int> <int> <int> <int> <int>
#1 154 2005 32 12 11 10
#2 432 2005 23 NA 11 NA
#3 123 2007 11 11 12 11
Or using base R with merge and reshape
# Match on the (ID, year) pair rather than on the year alone, so a df2 row
# is kept only when its Year_D is the YEAR_F recorded for that same ID in
# df1. The kept rows are reshaped to one row per ID (one DC1/DC2 column per
# Month) and merged back onto df1 via the shared ID column.
merge(
  df1,
  reshape(
    subset(df2, paste(ID, Year_D) %in% paste(df1$ID, df1$YEAR_F),
           select = -Year_D),
    idvar = "ID", direction = "wide", timevar = "Month"
  )
)
# ID YEAR_F DC1.1 DC2.1 DC1.2 DC2.2
#1 123 2007 11 12 11 11
#2 154 2005 32 11 12 10
#3 432 2005 23 11 NA NA
data
df1 <- structure(list(ID = c(154L, 432L, 123L), YEAR_F = c(2005L, 2005L,
2007L)), class = "data.frame", row.names = c(NA, -3L))
df2 <- structure(list(ID = c(154L, 154L, 154L, 154L, 432L, 432L, 432L,
123L, 123L, 123L, 123L), Year_D = c(2001L, 2001L, 2005L, 2005L,
2005L, 2006L, 2006L, 2001L, 2007L, 2007L, 2004L), Month = c(1L,
2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L), DC1 = c(4L, 56L, 32L,
12L, 23L, 23L, 22L, 12L, 11L, 11L, 43L), DC2 = c(23L, 22L, 11L,
10L, 11L, 10L, 11L, 34L, 12L, 11L, 43L)), class = "data.frame",
row.names = c(NA,
-11L))

Cut values to intervals and plot a heatmap in ggplot2

Given a dataframe as follows:
df <- structure(list(year = c(2001L, 2001L, 2001L, 2001L, 2002L, 2002L,
2002L, 2002L, 2003L, 2003L, 2003L, 2003L), quater = c(1L, 2L,
3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), value = c(4L, 23L, 14L,
12L, 6L, 22L, 45L, 12L, 34L, 15L, 3L, 40L)), class = "data.frame", row.names = c(NA,
-12L))
Out:
year quater value
0 2001 1 4
1 2001 2 23
2 2001 3 14
3 2001 4 12
4 2002 1 6
5 2002 2 22
6 2002 3 45
7 2002 4 12
8 2003 1 34
9 2003 2 15
10 2003 3 3
11 2003 4 40
How could I plot a chart similar to the plot below:
Please note that year and quater in this dataset correspond to year and week in the plot above.
I need to first cut the value column by (0, 10], (10, 20], (20, 30], (30, 40], (40, 50] then plot them.
The code I have tried:
ggplot(df, aes(week, year, fill= value)) +
geom_tile() +
scale_fill_gradient(low="white", high="red")
Out:
As you can see, the legend is different to what I need.
Thanks for your help.
You should first use cut to get the classes (as Ronak Shah already mentioned) and then you can use scale_fill_brewer to change the color of the tiles.
library(tidyverse)
df %>%
mutate(class = cut(value, seq(0, 50, 10))) %>%
ggplot(aes(quater, year, fill = class) ) +
geom_tile() +
scale_fill_brewer(type = "seq",
direction = 1,
palette = "RdPu")

Reshape data frame from long to wide format

I have a problem like that. I have a database like:
Province cases year month
Newyork 10 2000 1
Newyork 20 2000 2
Newyork 30 2000 3
Newyork 40 2000 4
Los Angeles 30 2000 1
Los Angeles 40 2000 2
Los Angeles 50 2000 3
Los Angeles 60 2000 4
The data is very large, covering 20 years and many Provinces. How can I regroup my data to get a time sequence like this:
Province cases.at.1.2000 cases.at.2.2000 cases.at.3.2000 cases.at.4.2000
Newyork 10 20 30 40
Los Angeles 30 40 50 60
Just use dcast from reshape2 package:
library(reshape2)
dcast(df, Province~month+year, value.var='cases')
# Province 1_2000 2_2000 3_2000 4_2000
#1 LosAngeles 30 40 50 60
#2 Newyork 10 20 30 40
Data:
df=structure(list(Province = c("Newyork", "Newyork", "Newyork",
"Newyork", "LosAngeles", "LosAngeles", "LosAngeles", "LosAngeles"
), cases = c(10L, 20L, 30L, 40L, 30L, 40L, 50L, 60L), year = c(2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L), month = c(1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L)), .Names = c("Province", "cases",
"year", "month"), class = "data.frame", row.names = c(NA, -8L
))
Edit: if you have missing month/province, you can still use dcast:
# Province cases year month
#1 Newyork 10 2000 1
#2 Newyork 20 2000 2
#3 Newyork 30 2000 3
#4 Newyork 40 2000 4
#5 LosAngeles 30 2000 1
#6 LosAngeles 40 2000 2
#7 LosAngeles 50 2000 3
#8 LosAngeles 60 2000 4
#9 Newyork 99 2000 5
#10 SanDiego 99 2000 5
dcast(df, Province~month+year, value.var='cases')
# Province 1_2000 2_2000 3_2000 4_2000 5_2000
#1 LosAngeles 30 40 50 60 NA
#2 Newyork 10 20 30 40 99
#3 SanDiego NA NA NA NA 99
We can use reshape from base R after joining the 'month' and 'year' columns (paste(...))
reshape(
transform(df1, yearmonth=paste('at', month, year, sep="."))[,-(3:4)],
idvar='Province', timevar='yearmonth', direction='wide')
# Province cases.at.1.2000 cases.at.2.2000 cases.at.3.2000 cases.at.4.2000
# 1 Newyork 10 20 30 40
# 5 Los Angeles 30 40 50 60
data
df1 <- structure(list(Province = c("Newyork", "Newyork", "Newyork",
"Newyork", "Los Angeles", "Los Angeles", "Los Angeles", "Los Angeles"
), cases = c(10L, 20L, 30L, 40L, 30L, 40L, 50L, 60L), year = c(2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L), month = c(1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L)), .Names = c("Province", "cases",
"year", "month"), class = "data.frame", row.names = c(NA, -8L))
Based on @Ananda Mahto's suggestion:
library(tidyr); library(dplyr)
# Build "cases.at.<month>.<year>" keys, then widen so each key becomes a
# column holding that Province's cases. pivot_wider() replaces the
# superseded spread().
df %>%
  mutate(month = paste0("cases.at.", month)) %>%
  unite(key, month, year, sep = ".") %>%
  pivot_wider(names_from = key, values_from = cases)
If you have missing month - year for some Province, use expand:
# expand() lists every Province/year/month combination; joining df back on
# those three keys leaves cases as NA where a combination has no data.
# The join keys are explicit so the join does not rely on name guessing.
df %>%
  expand(Province, year, month) %>%
  left_join(df, by = c("Province", "year", "month")) %>%
  mutate(month = paste0("cases.at.", month)) %>%
  unite(key, month, year, sep = ".") %>%
  # pivot_wider() replaces the superseded spread().
  pivot_wider(names_from = key, values_from = cases)
Data:
df=structure(list(Province = c("Newyork", "Newyork", "Newyork",
"Newyork", "LosAngeles", "LosAngeles", "LosAngeles", "LosAngeles", "SanDiego"),
cases = c(10L, 20L, 30L, 40L, 30L, 40L, 50L, 60L, 90L), year = c(2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L), month = c(1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 4L)), .Names = c("Province", "cases",
"year", "month"), class = "data.frame", row.names = c(NA, -9L))

Resources