Related
In the data below I want to compute the following ratio: tr(year) / (op(year) - op(year-1)). I would appreciate an answer with dplyr.
year op tr cp
<chr> <dbl> <dbl> <dbl>
1 1984 10 39.1 38.3
2 1985 55 132. 77.1
3 1986 79 69.3 78.7
4 1987 78 47.7 74.1
5 1988 109 77.0 86.4
this is the expected output
year2 ratio
1 1985 2.933333
2 1986 2.887500
3 1987 -47.700000
4 1988 2.483871
I do not manage to get to any result...
Use lag:
library(dplyr)
# Ratio of tr to the change in op relative to the previous row.
df %>%
# lag(op) is the op value of the previous row, so the rows are assumed to be
# sorted by year already; .keep = "none" retains only the columns named here.
mutate(year = year,
ratio = tr / (op - lag(op)),
.keep = "none") %>%
# the first row has no previous op, so its ratio is NA and the row is dropped
tidyr::drop_na()
# year ratio
#2 1985 2.933333
#3 1986 2.887500
#4 1987 -47.700000
#5 1988 2.483871
We may use
library(dplyr)
# diff(op) returns op[i] - op[i - 1] and is one element shorter than op,
# so the first year and the first tr are dropped to keep the vectors aligned.
df1 %>%
reframe(year = year[-1], ratio = tr[-1]/diff(op))
-output
year ratio
1 1985 2.933333
2 1986 2.887500
3 1987 -47.700000
4 1988 2.483871
data
df1 <- structure(list(year = 1984:1988, op = c(10L, 55L, 79L, 78L, 109L
), tr = c(39.1, 132, 69.3, 47.7, 77), cp = c(38.3, 77.1, 78.7,
74.1, 86.4)), class = "data.frame", row.names = c("1", "2", "3",
"4", "5"))
I have the following DataFrame in R:
Y ... Price Year Quantity Country
010190 ... 4781 2021 4 Germany
010190 ... 367 2021 3 Germany
010190 ... 4781 2021 6 France
010190 ... 250 2021 3 France
020190 ... 690 2021 NA USA
020190 ... 10 2021 6 USA
...... ... .... .. ...
217834 ... 56 2021 3 USA
217834 ... 567 2021 9 USA
As you can see, the numbers in the Y column start with 01..., 02..., 21.... I want to aggregate such rows from 6 digits to 2 digits, grouping by the categorical columns (e.g. Country and Year) and summing the numerical columns such as Quantity and Price. I also want rows with NAs to be taken into account during the calculation. So, in the end, I want this kind of output:
Y Price Year Quantity Country
01 5148 2021 7 Germany
01 5031 2021 9 USA
02 700 2021 6 USA
.. .... ... .... ...
21 623 2021 12 USA
You can use group_by and summarize from dplyr
library(dplyr)
df %>%
mutate(Y = sprintf(as.numeric(factor(Y, unique(Y))), fmt = '%02d')) %>%
group_by(Y, Year, Country) %>%
summarize(across(where(is.numeric), sum))
#> # A tibble: 4 x 5
#> # Groups: Y, Year [3]
#> Y Year Country Price Quantity
#> <chr> <int> <chr> <int> <int>
#> 1 01 2021 France 5031 9
#> 2 01 2021 Germany 5148 7
#> 3 02 2021 USA 700 NA
update: request:
library(dplyr)
# Collapse the 6-digit code to its 2-digit prefix, then total Price and
# Quantity per prefix, year and country.
df %>%
# assumes Y is stored as character so the leading zero survives - TODO confirm
mutate(Y = substr(Y, 1, 2)) %>%
group_by(Y, Year, Country) %>%
# na.rm = TRUE skips missing values instead of returning NA for the group
summarise(across(c(Price, Quantity), ~sum(., na.rm = TRUE)))
We could use substr to get the first two characters from Y and group_by and summarise() with sum()
library(dplyr)
# Same aggregation with each total spelled out explicitly.
df %>%
# keep only the first two characters of the code (presumably a character
# column; an integer Y would already have lost its leading zero)
mutate(Y = substr(Y, 1, 2)) %>%
group_by(Y, Year, Country) %>%
# one output row per Y/Year/Country group; NAs are ignored in both sums
summarise(Price = sum(Price, na.rm = TRUE),
Quantity = sum(Quantity, na.rm = TRUE)
)
Y Year Country Price Quantity
<chr> <dbl> <chr> <dbl> <dbl>
1 01 2021 France 5031 9
2 01 2021 Germany 5148 7
3 02 2021 USA 700 6
4 21 2021 USA 623 12
Using aggregate and the substring of Y.
# Y is stored as an integer in `dat`, so the leading zero of codes such as
# 010190 is gone (10190). Re-pad to six digits before taking the two-digit
# prefix; a plain substr(Y, 1, 2) would give "10" and "20" instead of "01"
# and "02".
# na.action = na.pass keeps rows that contain an NA (the formula method drops
# them entirely by default) and na.rm = TRUE is forwarded to sum(), so a
# missing Quantity no longer discards that row's Price.
aggregate(cbind(Quantity, Price) ~ Y + Year + Country,
transform(dat, Y = substr(sprintf("%06d", as.integer(Y)), 1, 2)), sum,
na.rm = TRUE, na.action = na.pass)
# Y Year Country Quantity Price
# 1 01 2021 France 9 5031
# 2 01 2021 Germany 7 5148
# 3 02 2021 USA 7 700
# 4 21 2021 USA 12 623
Data:
dat <- structure(list(Y = c(10190L, 10190L, 10190L, 10190L, 20190L,
20190L, 217834L, 217834L), foo = c("...", "...", "...", "...",
"...", "...", "...", "..."), Price = c(4781L, 367L, 4781L, 250L,
690L, 10L, 56L, 567L), Year = c(2021L, 2021L, 2021L, 2021L, 2021L,
2021L, 2021L, 2021L), model = c(NA, NA, NA, NA, NA, NA, "Tesla",
"Tesla"), Quantity = c(4L, 3L, 6L, 3L, 1L, 6L, 3L, 9L), Country = c("Germany",
"Germany", "France", "France", "USA", "USA", "USA", "USA")), class = "data.frame", row.names = c(NA,
-8L))
I want to calculate the weighted variance using the weights provided in the dataset, while group for the countries and cities, however the function returns NAs:
library(Hmisc) #for the 'wtd.var' function
weather_winter.std<-weather_winter %>%
group_by(country, capital_city) %>%
summarise(across(starts_with("winter"),wtd.var))
The provided output from the console (when in long format):
# A tibble: 35 x 3
# Groups: country [35]
country capital_city winter
<chr> <chr> <dbl>
1 ALBANIA Tirane NA
2 AUSTRIA Vienna NA
3 BELGIUM Brussels NA
4 BULGARIA Sofia NA
5 CROATIA Zagreb NA
6 CYPRUS Nicosia NA
7 CZECHIA Prague NA
8 DENMARK Copenhagen NA
9 ESTONIA Tallinn NA
10 FINLAND Helsinki NA
# … with 25 more rows
This is the code that I used to get the data from a wide format into a long format:
weather_winter <- weather_winter %>% pivot_longer(-c(31:33))
weather_winter$name <- NULL
names(weather_winter)[4] <- "winter"
Some example data:
structure(list(`dec-wet_2011` = c(12.6199998855591, 12.6099996566772,
14.75, 11.6899995803833, 18.2899990081787), `dec-wet_2012` = c(13.6300001144409,
14.2199993133545, 14.2299995422363, 16.1000003814697, 18.0299987792969
), `dec-wet_2013` = c(4.67999982833862, 5.17000007629395, 4.86999988555908,
7.56999969482422, 5.96000003814697), `dec-wet_2014` = c(14.2999992370605,
14.4799995422363, 13.9799995422363, 15.1499996185303, 16.1599998474121
), `dec-wet_2015` = c(0.429999977350235, 0.329999983310699, 1.92999994754791,
3.30999994277954, 7.42999982833862), `dec-wet_2016` = c(1.75,
1.29999995231628, 3.25999999046326, 6.60999965667725, 8.67999935150146
), `dec-wet_2017` = c(13.3400001525879, 13.3499994277954, 15.960000038147,
10.6599998474121, 14.4699993133545), `dec-wet_2018` = c(12.210000038147,
12.4399995803833, 11.1799993515015, 10.75, 18.6299991607666),
`dec-wet_2019` = c(12.7199993133545, 13.3800001144409, 13.9899997711182,
10.5299997329712, 12.3099994659424), `dec-wet_2020` = c(15.539999961853,
16.5200004577637, 11.1799993515015, 14.7299995422363, 13.5499992370605
), `jan-wet_2011` = c(8.01999950408936, 7.83999967575073,
10.2199993133545, 13.8899993896484, 14.5299997329712), `jan-wet_2012` = c(11.5999994277954,
11.1300001144409, 12.5500001907349, 10.1700000762939, 22.6199989318848
), `jan-wet_2013` = c(17.5, 17.4099998474121, 15.5599994659424,
13.3199996948242, 20.9099998474121), `jan-wet_2014` = c(12.5099992752075,
12.2299995422363, 15.210000038147, 9.73999977111816, 9.63000011444092
), `jan-wet_2015` = c(17.6900005340576, 16.9799995422363,
11.75, 9.9399995803833, 19), `jan-wet_2016` = c(15.6099996566772,
15.5, 14.5099992752075, 10.3899993896484, 18.4499988555908
), `jan-wet_2017` = c(9.17000007629395, 9.61999988555908,
9.30999946594238, 15.8499994277954, 11.210000038147), `jan-wet_2018` = c(8.55999946594238,
9.10999965667725, 13.2599992752075, 9.85999965667725, 15.8899993896484
), `jan-wet_2019` = c(17.0699996948242, 16.8699989318848,
14.5699996948242, 19.0100002288818, 19.4699993133545), `jan-wet_2020` = c(6.75999975204468,
6.25999975204468, 6.00999975204468, 5.35999965667725, 8.15999984741211
), `feb-wet_2011` = c(9.1899995803833, 8.63999938964844,
6.21999979019165, 9.82999992370605, 4.67999982833862), `feb-wet_2012` = c(12.2699995040894,
11.6899995803833, 8.27999973297119, 14.9399995803833, 13.0499992370605
), `feb-wet_2013` = c(15.3599996566772, 15.9099998474121,
17.0599994659424, 13.3599996566772, 16.75), `feb-wet_2014` = c(10.1999998092651,
11.1399993896484, 13.8599996566772, 10.7399997711182, 7.35999965667725
), `feb-wet_2015` = c(11.9200000762939, 12.2699995040894,
8.01000022888184, 14.5299997329712, 5.71999979019165), `feb-wet_2016` = c(14.6999998092651,
14.7799997329712, 16.7899990081787, 4.90000009536743, 19.3500003814697
), `feb-wet_2017` = c(8.98999977111816, 9.17999935150146,
11.7699995040894, 6.3899998664856, 13.9899997711182), `feb-wet_2018` = c(16.75,
16.8599987030029, 12.0599994659424, 16.1900005340576, 8.51000022888184
), `feb-wet_2019` = c(7.58999967575073, 7.26999998092651,
8.21000003814697, 7.57999992370605, 8.81999969482422), `feb-wet_2020` = c(10.6399993896484,
10.4399995803833, 13.4399995803833, 8.53999996185303, 19.939998626709
), country = c("SERBIA", "SERBIA", "SLOVENIA", "GREECE",
"CZECHIA"), capital_city = c("Belgrade", "Belgrade", "Ljubljana",
"Athens", "Prague"), weight = c(20.25, 19.75, 14.25, 23.75,
14.25)), row.names = c(76L, 75L, 83L, 16L, 5L), class = "data.frame")
Your code seems to provide the right answer, now there's more data:
# Groups: country [4]
country capital_city winter
<chr> <chr> <dbl>
1 CZECHIA Prague 27.2
2 GREECE Athens 14.6
3 SERBIA Belgrade 19.1
4 SLOVENIA Ljubljana 16.3
Is this what you were looking for?
I took the liberty of streamlining your code:
# Reshape to long format: every column except positions 31:33 (country,
# capital_city and weight in the example data) is stacked into a single
# "winter" value column; the generated "name" column is not needed afterwards.
weather_winter <- weather_winter %>%
pivot_longer(-c(31:33), values_to = "winter") %>%
select(-name)
# Variance of winter per country / capital city. No weights are passed here,
# so wtd.var() treats every observation equally.
weather_winter.std <- weather_winter %>%
group_by(country, capital_city) %>%
summarise(winter = wtd.var(winter))
With only one "winter" column, there's no need for the across().
Finally, you are not using the weights. If these are needed, then change the last line to:
summarise(winter = wtd.var(winter, weights = weight))
To give:
# A tibble: 4 x 3
# Groups: country [4]
country capital_city winter
<chr> <chr> <dbl>
1 CZECHIA Prague 26.3
2 GREECE Athens 14.2
3 SERBIA Belgrade 18.8
4 SLOVENIA Ljubljana 15.8
I have a list of accounts (300k plus rows), going back six years, with a user number, open and close dates, and other information, such as location. We offer a variety of accounts, and a user can have one or several, in any combination, and both in succession as well as overlapping.
I've been asked to find out how many users we have in any given month. They'd like it split by location, as well as total.
so I have a table like this:
User Open Close Area
1 A 2018-02-13 2018-07-31 West
2 B 2018-02-26 2018-06-04 North
3 B 2018-02-27 2018-03-15 North
4 C 2018-02-27 2018-05-26 South
5 C 2018-03-15 2018-06-03 South
6 D 2018-03-20 2018-07-02 East
7 E 2018-04-01 2018-06-19 West
8 E 2018-04-14 2018-05-04 West
9 F 2018-03-20 2018-04-19 North
10 G 2018-04-26 2018-07-04 South
11 H 2017-12-29 2018-03-21 East
12 I 2016-11-29 2020-04-10 West
13 J 2018-01-31 2018-12-20 West
14 K 2017-10-31 2018-10-30 North
15 K 2018-10-31 2019-10-30 North
And I want to get to one that looks something like this:
Month Total North East South West
1 Feb 18 3 1 0 1 1
2 Mar 18 5 2 1 1 1
3 Apr 18 7 2 1 2 2
4 May 18 6 1 1 2 2
5 Jun 18 6 1 1 2 2
6 Jul 18 3 0 1 1 1
I can filter the data to get to what I need for individual months using
df%>%
filter(Open <= as.Date("2018-04-30") & Close >= as.Date("2018-04-01")) %>%
distinct(PERSON_ID, .keep_all = TRUE) %>%
count(Area)
But what I can't figure out is how to repeat that for every month in the data set automatically. Is there any way of getting R to repeat the above for every month in my data set, and then pass the results into a second table?
Any and all help gratefully received, and many thanks for your time.
Edit: added examples to the source data where Matin Gal's solution returned NA for years
This is a general solution working for dates spanning over more than one year.
library(dplyr)
library(tidyr)
library(lubridate)
# Expand every account (row) into one record per calendar month it was open,
# then count distinct users per month, year and area.
# Reads `df`, the object created in the Data section of this answer.
df %>%
group_by(rn = row_number()) %>%
# seq runs from the opening month number to the closing month number, with 12
# added for every year boundary crossed (e.g. Nov 2016 -> Apr 2020 is 11:52).
mutate(seq = list(seq(month(Open), month(Close) + 12 * (year(Close) - year(Open))))) %>%
unnest(seq) %>%
mutate(
# fold the running month counter back into 1..12
seq_2 = (seq - 1) %% 12 + 1,
month = month(seq_2, label = TRUE),
# Derive the year arithmetically. `Open + months(k)` is NA whenever the
# target month has no such day (e.g. 2018-01-31 + 1 month), which produced
# NA years for accounts opened on the 29th-31st.
year = year(Open) + (seq - 1) %/% 12
) %>%
ungroup() %>%
# a user with several overlapping accounts in one area counts once per month
distinct(User, month, year, Area) %>%
count(month, year, Area) %>%
pivot_wider(
names_from = "Area",
values_from = "n",
values_fill = 0
) %>%
mutate(Total = rowSums(across(c(North, South, West, East))))
returns
month year North South West East Total
<ord> <dbl> <int> <int> <int> <int> <dbl>
1 Feb 2018 1 1 1 0 3
2 Mar 2018 2 1 1 1 5
3 Apr 2018 2 2 2 1 7
4 May 2018 1 2 2 1 6
5 Jun 2018 1 2 2 1 6
6 Jul 2018 0 1 1 1 3
Data
df <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), User = c("A",
"B", "B", "C", "C", "D", "E", "E", "F", "G"), Open = structure(c(17575,
17588, 17589, 17589, 17605, 17610, 17622, 17635, 17610, 17647
), class = "Date"), Close = structure(c(17743, 17686, 17605,
17677, 17685, 17714, 17701, 17655, 17640, 17716), class = "Date"),
Area = c("West", "North", "North", "South", "South", "East",
"West", "West", "North", "South")), problems = structure(list(
row = 10L, col = "Area", expected = "", actual = "embedded null",
file = "literal data"), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame")), class = c("spec_tbl_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -10L), spec = structure(list(
cols = list(id = structure(list(), class = c("collector_double",
"collector")), User = structure(list(), class = c("collector_character",
"collector")), Open = structure(list(format = ""), class = c("collector_date",
"collector")), Close = structure(list(format = ""), class = c("collector_date",
"collector")), Area = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
Here's how I'd do it:
library(tidyverse)
set.seed(14159)
## generating some data that looks roughly
## like your data
## 20 random accounts drawn from the days of 1999 (plus 2000-01-01); rows whose
## close date is not after the open date are discarded afterwards
data <- tibble(
user = sample(LETTERS[1:5], size = 20, replace = TRUE),
open = sample(seq(as.Date('1999/01/01'), as.Date('2000/01/01'), by="day"), 20),
close = sample(seq(as.Date('1999/01/01'), as.Date('2000/01/01'), by="day"), 20),
## TRUE rather than T: T is an ordinary variable and can be reassigned
area = sample(c("N", "E", "S", "W"), 20, replace = TRUE)
) %>%
filter(
close > open
)
data
#> # A tibble: 9 × 4
#> user open close area
#> <chr> <date> <date> <chr>
#> 1 A 1999-04-03 1999-07-28 N
#> 2 B 1999-01-27 1999-05-12 W
#> 3 B 1999-06-05 1999-12-29 W
#> 4 C 1999-09-26 1999-12-30 W
#> 5 C 1999-04-21 1999-12-04 E
#> 6 C 1999-08-11 1999-12-12 N
#> 7 A 1999-02-13 1999-09-16 W
#> 8 E 1999-02-17 1999-05-21 E
#> 9 B 1999-07-26 1999-08-16 S
## figuring out what calendar months are in between open and close
## open, close: single Dates (rowwise() below calls this once per account)
## returns: a one-element list holding the first day of every month touched
get_months_in_range <- function(open, close) {
  ## Snap both ends to the first of their month. Stepping from the raw open
  ## date gives dates such as 1999-01-27 that never line up with another
  ## account's dates, and it skips the closing month whenever close's
  ## day-of-month is earlier than open's.
  first_month <- as.Date(format(open, "%Y-%m-01"))
  last_month <- as.Date(format(close, "%Y-%m-01"))
  ## wrapped in a list so mutate() can store the whole vector in one cell
  list(seq.Date(first_month, last_month, by = "month"))
}
data %>%
rowwise() %>%
mutate(
Month = get_months_in_range(open, close)
) %>%
ungroup() %>%
unnest_longer(
col = Month
) %>%
## count users, not accounts: a user with overlapping accounts in the same
## area must only be counted once per month
distinct(user, Month, area) %>%
count(Month, area) %>%
pivot_wider(
names_from = area,
values_from = n,
values_fill = 0
) %>%
rowwise() %>%
mutate(
Total = sum(
c_across(
-Month
)
)
) %>%
ungroup()
#> # A tibble: 12 × 6
#> Month W E N S Total
#> <date> <int> <int> <int> <int> <int>
#> 1 1999-01-01 1 0 0 0 1
#> 2 1999-02-01 2 1 0 0 3
#> 3 1999-03-01 2 1 0 0 3
#> 4 1999-04-01 2 2 1 0 5
#> 5 1999-05-01 2 2 1 0 5
#> 6 1999-06-01 2 1 1 0 4
#> 7 1999-07-01 2 1 1 1 5
#> 8 1999-08-01 2 1 1 1 5
#> 9 1999-09-01 3 1 1 0 5
#> 10 1999-10-01 2 1 1 0 4
#> 11 1999-11-01 2 1 1 0 4
#> 12 1999-12-01 2 1 1 0 4
Created on 2021-08-18 by the reprex package (v2.0.1)
It's not the world's sexiest solution, but I think it'll get you where you're trying to go. Basically, I just make a helper function that gives me all the dates between open and close and then you can group by those to figure out how many users you have in any given month. Let me know if you want more explanation about what the long chain of dplyr stuff is doing.
welcome to SO. I can't test this code as you haven't provided a snippet of your data in the right format (see below for a suggestion on this point), but I think the basic idea of what you want to do is extract a month-year value from Open and then use group_by. For example:
library(lubridate)
library(dplyr)
# Count distinct people per area for each opening month.
df %>% mutate(
# Open is written year-month-day in the question (e.g. 2018-02-13), so it
# has to be parsed with ymd(); dmy() expects day-month-year.
Date = ymd(Open),
# "ym" precision gives a year-month key such as "2018-02"
Month_Yr = format_ISO8601(Date, precision = "ym")) %>%
group_by(Month_Yr) %>%
# NOTE(review): the identifier column is called User in the sample table and
# PERSON_ID in the question's own code - confirm the real column name.
distinct(PERSON.ID, .keep_all = TRUE) %>%
count(Area)
Generally when sharing data on SO it's best to use a dput. See ?dput for info on how to use it if you're unsure.
I have a data frame with the following structure:`
var1 var2 var3
año: 2005 km: 128000 marca: chevrolet
año: 2019 marca: hyundai km: 50000
marca: toyota año: 2012 km: 340000
`
I need to create new variables where the corresponding information is assigned
año marca km
2005 chevrolet 128000
2019 hyundai 50000
2012 toyota 340000
I'd love it if someone could help me with a loop for this purpose.
library(tidyverse)
df <- tibble::tribble(
~var1, ~var2, ~var3,
"ano: 2005", "km: 128000", "marca: chevrolet",
"ano: 2019", "marca: hyundai", "km: 50000",
"marca: toyota", "ano: 2012", "km: 340000"
)
# Turn the "key: value" cells into one column per key.
# The original row number is carried through the reshape so that each value
# stays on its own row. Stacking the columns and unnesting list-columns pairs
# values by order of appearance instead, which mixed up the rows (2005 ended
# up next to toyota).
df %>%
mutate(row = row_number()) %>%
pivot_longer(-row, values_to = "entry") %>%
separate(entry, into = c("column", "value"), sep = ": ") %>%
# id_cols = row drops the leftover var1/var2/var3 label column
pivot_wider(id_cols = row, names_from = column, values_from = value) %>%
select(ano, marca, km)
#> # A tibble: 3 x 3
#> ano marca km
#> <chr> <chr> <chr>
#> 1 2005 chevrolet 128000
#> 2 2019 hyundai 50000
#> 3 2012 toyota 340000
Here is a base R code
# Target column order; each row of df holds these keys in arbitrary order.
pat <- c("ano", "marca", "km")
dfout <- setNames(
  data.frame(t(apply(df, 1, function(v) {
    keys <- trimws(gsub(":.*", "", v))    # text before the colon
    values <- trimws(gsub(".*:", "", v))  # text after the colon
    # match(pat, keys) gives, for every target column, the position of its
    # key within this row. The reverse call match(keys, pat) applies the
    # inverse permutation and is only right when the row's ordering happens
    # to be its own inverse (e.g. it fails for "km", "ano", "marca").
    values[match(pat, keys)]
  }))),
  pat
)
such that
> dfout
ano marca km
1 2005 chevrolet 128000
2 2019 hyundai 50000
3 2012 toyota 340000
DATA
df <- structure(list(var1 = c("ano: 2005", "ano: 2019", "marca: toyota"
), var2 = c("km: 128000", "marca: hyundai", "ano: 2012"), var3 = c("marca: chevrolet",
"km: 50000", "km: 340000")), class = "data.frame", row.names = c(NA,
-3L))
One way to solve it using purrr, dplyr and tidyr could be:
# Split df into one single-column data frame per column, tag every cell with
# its row number, break "key: value" into two fields, stack the pieces and
# spread the keys back out into columns.
# seq_along(df) replaces 1:length(df), which would yield c(1, 0) for a data
# frame without columns.
map_dfr(.x = split.default(df, seq_along(df)),
~ .x %>%
mutate(rowid = row_number()) %>%
separate(1, sep = ": ", into = c("column", "variable"))) %>%
pivot_wider(names_from = "column", values_from = "variable")
rowid ano marca km
<int> <chr> <chr> <chr>
1 1 2005 chevrolet 128000
2 2 2019 hyundai 50000
3 3 2012 toyota 340000