I have a dataset of pairs of cities V1 and V2. Each cities has a population v1_pop2015 and v2_pop2015.
I would like to create a new dataset with only the cityCode of the biggest city and its populated added of the population of the smallest.
I was able to create the output I want with a for loop. For educationnal purpose, I tried to do it using tidyverse tools without success.
This is a working sample
library(tidyverse)
## Sample dataset
pairs_pop <- structure(list(cityCodeV1 = c(20073, 20888, 20222, 22974, 23792,
20779), cityCodeV2 = c(20063, 204024, 20183, 20406, 23586, 23595
), v1_pop2015 = c(414, 682, 497, 3639, 384, 596), v2_pop2015 = c(384,
757, 5716, 315, 367, 1303)), row.names = c(NA, 6L), class = c("tbl_df",
"tbl", "data.frame"))
pairs_pop
#> # A tibble: 6 x 4
#> cityCodeV1 cityCodeV2 v1_pop2015 v2_pop2015
#> * <dbl> <dbl> <dbl> <dbl>
#> 1 20073 20063 414 384
#> 2 20888 204024 682 757
#> 3 20222 20183 497 5716
#> 4 22974 20406 3639 315
#> 5 23792 23586 384 367
#> 6 20779 23595 596 1303
#### This is working !!!
clean_df <- setNames(data.frame(matrix(ncol = 2, nrow = dim(pairs_pop)[1])),c("to_keep", "to_keep_pop"))
# For each row, determine which city is the biggest and adds the two cities population
for (i in 1:dim(pairs_pop)[1]) {
if(pairs_pop$v1_pop2015[i] > pairs_pop$v2_pop2015[i])
{
clean_df$to_keep[i] = pairs_pop$cityCodeV1[i]
clean_df$to_keep_pop[i] = pairs_pop$v1_pop2015[i] + pairs_pop$v2_pop2015[i]
}
else
{
clean_df$to_keep[i] = pairs_pop$cityCodeV2[i]
clean_df$to_keep_pop[i] = pairs_pop$v1_pop2015[i] + pairs_pop$v2_pop2015[i]
}
}
clean_df
#> to_keep to_keep_pop
#> 1 20073 798
#> 2 204024 1439
#> 3 20183 6213
#> 4 22974 3954
#> 5 23792 751
#> 6 23595 1899
This is where I'm stucked
### trying to tidy it with rowwise, mutate and a function
v1_sup_tov2 <- function(x){
print(x)
if(x$v1_pop2015 > x$v2_pop2015){
return (TRUE)
}
return(FALSE)
}
to_clean_df2 <- pairs_pop %>%
rowwise() %>%
mutate_if(v1_sup_tov2,
to_keep = cityCodeV1,
to_delete= cityCodeV2,
to_keep_pop = v1_pop2015 + v2_pop2015)
The expected output is a dataframe with 2 colums like this:
to_keep: cityCode of the city I want to keep
to_keep_pop: population of that city
clean_df
#> to_keep to_keep_pop
#> 1 20073 798
#> 2 204024 1439
#> 3 20183 6213
#> 4 22974 3954
#> 5 23792 751
#> 6 23595 1899
What about this?
library(dplyr)
## Sample dataset
pairs_pop <- structure(
list(cityCodeV1 = c(20073, 20888, 20222, 22974, 23792, 20779),
cityCodeV2 = c(20063, 204024, 20183, 20406, 23586, 23595),
v1_pop2015 = c(414, 682, 497, 3639, 384, 596),
v2_pop2015 = c(384, 757, 5716, 315, 367, 1303)),
row.names = c(NA, 6L), class = c("tbl_df", "tbl", "data.frame"))
clean_df <- transmute(pairs_pop,
to_keep = if_else(v1_pop2015 > v2_pop2015, cityCodeV1, cityCodeV2),
to_keep_pop = v1_pop2015 + v2_pop2015)
Just in case one day you get multiple cities with v1, v2, v3, ...
Do not forget to keep all information in your dataframe so that you know what value is related to what. A tidy dataframe.
library(dplyr)
## Sample dataset
pairs_pop <- structure(
list(cityCodeV1 = c(20073, 20888, 20222, 22974, 23792, 20779),
cityCodeV2 = c(20063, 204024, 20183, 20406, 23586, 23595),
v1_pop2015 = c(414, 682, 497, 3639, 384, 596),
v2_pop2015 = c(384, 757, 5716, 315, 367, 1303)),
row.names = c(NA, 6L), class = c("tbl_df", "tbl", "data.frame"))
# Tidy dataset with all information that was in columns
library(dplyr)
library(tidyr)
library(stringr)
tidy_pairs <- pairs_pop %>%
mutate(city = 1:n()) %>%
gather("key", "value", -city) %>%
mutate(ville = str_extract(key, "([[:digit:]])"),
key = case_when(
grepl("cityCode", key) ~ "cityCode",
grepl("pop", key) ~ "pop",
TRUE ~ "other"
)) %>%
spread(key, value)
And then you can apply the test you want
tidy_pairs %>%
group_by(city) %>%
summarise(to_keep = cityCode[pop == max(pop)],
to_keep_pop = sum(pop))
Related
I am trying to pair rows for use in a dumbbell plot. I have a df that looks like this:
Year
Species
Tonnes
1960
Cod
123
1961
Cod
456
1970
Cod
124
1971
Cod
457
I want to pair the up results 10 years apart, resulting in this df:
Year
Species
Tonnes
Pair
1960
Cod
123
1
1961
Cod
456
2
1970
Cod
124
1
1971
Cod
457
2
I would very much appreciate help. I wasn't too sure where to begin with the problem.
You could do
df <- structure(list(Year = c(1960L, 1961L, 1970L, 1971L), Species = c("Cod",
"Cod", "Cod", "Cod"), Tonnes = c(123, 150, 256, 450)), row.names = c(NA,
-4L), class = "data.frame")
library(tidyverse)
df %>%
mutate(year = Year %% 10,
decade = 10 * Year %/% 10) %>%
select(-Year) %>%
group_by(Species, year) %>%
summarize(from = Tonnes[which.min(decade)],
to = Tonnes[which.max(decade)],
year = paste(min(year + decade), max(year + decade), sep = '-')) %>%
ggplot(aes(from, year)) +
geom_linerange(aes(xmin = from, xmax = to), alpha = 0.5) +
geom_point(color = 'green4', size = 3) +
geom_point(aes(x = to), color = 'red3', size = 3) +
xlab('Tonnes') +
theme_minimal(base_size = 16)
Using data.table, a join will get the pairs in wide format:
library(data.table)
dt <- setDT(df)[
, `:=`(Year2 = Year + 10, Pair = rleid(Year, Species))
][
df,
.(Year1 = i.Year, Year2 = x.Year, Species, Tonnes1 = i.Tonnes, Tonnes2 = Tonnes, Pair = i.Pair),
on = .(Year = Year2, Species), nomatch = 0
]
dt
#> Year1 Year2 Species Tonnes1 Tonnes2 Pair
#> 1: 1960 1970 Cod 123 124 1
#> 2: 1961 1971 Cod 456 457 2
which can be melted to long format, if desired:
setcolorder(
melt(dt, c("Species", "Pair"), list(c("Year1", "Year2"), c("Tonnes1", "Tonnes2")), value.name = c("Year", "Tonnes")),
c("Year", "Species", "Tonnes", "Pair")
)[, variable := NULL][]
#> Year Species Tonnes Pair
#> 1: 1960 Cod 123 1
#> 2: 1961 Cod 456 2
#> 3: 1970 Cod 124 1
#> 4: 1971 Cod 457 2
Data:
df <- data.frame(Year = c(1960, 1961, 1970, 1971), Species = "Cod", Tonnes = c(123, 456, 124, 457))
I am trying to disaggregate the monthly data and spread them into weekly data in two ways.
First, To find the first Monday from the start date and then create days which are Mondays till the last date (month) of the sequence. And then spread the data within the respective week which is in the month.
Second, To create a weekly sequence from start date and end date and spread the data within the respective week which is in the month.
The data which I am working with is given below:
structure(list(`Row Labels` = c("X6", "X7", "X8", "X9"), `2022-11-01` = c(100,
200, 300, 400), `2022-12-01` = c(160, 200, 300, 400), `2023-01-01` = c(500,
550, 600, 650)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-4L))
And it looks like this:
The expected output 1 is given below, as you can see all the dates are Mondays:
The expected output 2 is given below:
Is this doable, or is it a bit too much to expect from R?
For Mondays we can create a list of Mondays between the dates in the dataframe, join it with the data in long format, count number of the Mondays for each variable in each month, divide the values by the number of Mondays, and revert back the format to wide;
library(dplyr)
library(tidyr)
library(lubridate)
all_dates <- as.Date(names(df1)[-1])
MON <- seq(min(floor_date(all_dates, "month")),
max(ceiling_date(all_dates, "month")),
by="1 day") %>%
.[wday(.,label = TRUE) == "Mon"] %>%
data.frame("Mondays" = .) %>%
mutate(mmm = format(Mondays, "%Y-%m"))
df1 %>%
pivot_longer(cols = -`Row Labels`, names_to = "dates") %>%
mutate(dates = as.Date(dates),
mmm = format(dates, "%Y-%m")) %>%
right_join(MON, by = "mmm") %>%
arrange(mmm) %>%
group_by(`Row Labels`, dates) %>%
mutate(value = value / n()) %>%
ungroup() %>%
select(`Row Labels`, Mondays, value) %>%
pivot_wider(`Row Labels`, names_from = "Mondays", values_from = "value")
#> # A tibble: 4 x 14
#> `Row Labels` `2022-11-07` `2022-11-14` `2022-11-21` `2022-11-28` `2022-12-05`
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 X6 25 25 25 25 40
#> 2 X7 50 50 50 50 50
#> 3 X8 75 75 75 75 75
#> 4 X9 100 100 100 100 100
#> # ... with 8 more variables: 2022-12-12 <dbl>, 2022-12-19 <dbl>,
#> # 2022-12-26 <dbl>, 2023-01-02 <dbl>, 2023-01-09 <dbl>, 2023-01-16 <dbl>,
#> # 2023-01-23 <dbl>, 2023-01-30 <dbl>
Same principal goes to doing it weekly:
WKLY <- seq(min(floor_date(all_dates, "month")),
max(ceiling_date(all_dates, "month")),
by="week") %>%
data.frame("Weekly" = .) %>%
mutate(mmm = format(Weekly, "%Y-%m"))
df1 %>%
pivot_longer(cols = -`Row Labels`, names_to = "dates") %>%
mutate(dates = as.Date(dates),
mmm = format(dates, "%Y-%m")) %>%
right_join(WKLY, by = "mmm") %>%
arrange(mmm) %>%
group_by(`Row Labels`, dates) %>%
mutate(value = value / n()) %>%
ungroup() %>%
select(`Row Labels`, Weekly, value) %>%
pivot_wider(`Row Labels`, names_from = "Weekly", values_from = "value")
#> # A tibble: 4 x 15
#> `Row Labels` `2022-11-01` `2022-11-08` `2022-11-15` `2022-11-22` `2022-11-29`
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 X6 20 20 20 20 20
#> 2 X7 40 40 40 40 40
#> 3 X8 60 60 60 60 60
#> 4 X9 80 80 80 80 80
#> # ... with 9 more variables: 2022-12-06 <dbl>, 2022-12-13 <dbl>,
#> # 2022-12-20 <dbl>, 2022-12-27 <dbl>, 2023-01-03 <dbl>, 2023-01-10 <dbl>,
#> # 2023-01-17 <dbl>, 2023-01-24 <dbl>, 2023-01-31 <dbl>
Data:
df1 <- structure(list(`Row Labels` = c("X6", "X7", "X8", "X9"),
`2022-11-01` = c(100, 200, 300, 400),
`2022-12-01` = c(160, 200, 300, 400),
`2023-01-01` = c(500, 550, 600, 650)),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -4L))
I am trying to replace some text in my dataframe (a few rows given below)
> dput(Henry.longer[1:4,])
structure(list(N_l = c(4, 4, 4, 4), UG = c("100", "100", "100",
"100"), S = c(12, 12, 12, 12), Sample = c(NA, NA, NA, NA), EQ = c("Henry",
"Henry", "Henry", "Henry"), DF = c(0.798545454545455, 0.798545454545455,
0.798545454545455, 0.798545454545455), meow = c("Henry.Exterior.single",
"Multi", "Henry.Exterior.multi", "Henry.Interior.single"), Girder = c("Henry.Exterior.single",
"Henry.Interior.multi", "Henry.Exterior.multi", "Interior")), row.names = c(NA,
-4L), groups = structure(list(UG = "100", S = 12, .rows = list(
1:4)), row.names = c(NA, -1L), class = c("tbl_df", "tbl",
"data.frame"), .drop = FALSE), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"))
I try to mutate the dataframe as:
Henry.longer <- Henry.longer %>%
mutate(Loading = str_replace(meow, "Henry.Exterior.single", "Single")) %>%
mutate(Loading = str_replace(meow, "Henry.Exterior.multi", "Multi")) %>%
mutate(Loading = str_replace(meow, "Henry.Interior.single", "Single")) %>%
mutate(Loading = str_replace(meow, "Henry.Interior.multi", "Multi")) %>%
mutate(Girder = str_replace(meow, "Henry.Exterior.multi", "Exterior")) %>%
mutate(Girder = str_replace(meow, "Henry.Exterior.single", "Exterior")) %>%
mutate(Girder = str_replace(meow, "Henry.Interior.multi", "Interior")) %>%
mutate(Girder = str_replace(meow, "Henry.Interior.single", "Interior")) %>%
select(-meow)
But for some reason the results does not get applied to all the rows and only:
N_l UG S Sample EQ DF Loading Girder
1 4 100 12 NA Henry 0.799 Henry.Exterior.single Henry.Exterior.single
2 4 100 12 NA Henry 0.799 Multi Henry.Interior.multi
3 4 100 12 NA Henry 0.799 Henry.Exterior.multi Henry.Exterior.multi
4 4 100 12 NA Henry 0.799 Henry.Interior.single Interior
I think we can use lookup vectors for this, if it's easy or safer to use static string lookups:
tr_vec <- c(Henry.Exterior.single = "Single", Henry.Exterior.multi = "Multi", Henry.Interior.single = "Single", Henry.Interior.multi = "Multi")
tr_vec2 <- c(Henry.Exterior.multi = "Exterior", Henry.Exterior.single = "Exterior", Henry.Interior.multi = "Interior", Henry.Interior.single = "Interior")
Henry.longer %>%
mutate(
Loading = coalesce(tr_vec[Loading], Loading),
Girder = coalesce(tr_vec2[Girder], Girder)
)
# # A tibble: 4 x 8
# # Groups: UG, S [1]
# N_l UG S Sample EQ DF Loading Girder
# <dbl> <chr> <dbl> <lgl> <chr> <dbl> <chr> <chr>
# 1 4 100 12 NA Henry 0.799 Single Exterior
# 2 4 100 12 NA Henry 0.799 Multi Interior
# 3 4 100 12 NA Henry 0.799 Multi Exterior
# 4 4 100 12 NA Henry 0.799 Single Interior
The advantage of RonakShah's regex solution is that it can very easily handle many of the types of substrings you appear to need. Regexes do carry a little risk, though, in that they may (unlikely in that answer, but) miss match.
Instead of using str_replace I guess it would be easier to extract what you want using regex.
library(dplyr)
Henry.longer %>%
mutate(Loading = sub('.*\\.', '', meow),
Girder = sub('.*\\.(\\w+)\\..*', '\\1', meow))
where
Loading - removes everything until last dot
Girder - extracts a word between two dots.
Oh boy, looks like you've got some answers here already but here's a super-simple one that uses stringr::str_extract:
Henry.longer <- Henry.longer %>%
mutate(Loading = str_extract(meow, "single|multi")) %>%
mutate(Girder = str_extract(meow, "Interior|Exterior"))
It's worth noting that the demo data has a weird entry for meow in one column, so it didn't run perfectly on my machine:
I have a count of stems by tree species for different plots.
structure(list(Plot = c(1, 2), Pine = c(0, 430), Spruce = c(708,
1241), Birch = c(119, 48), Aspen = c(0, 0), Salix = c(0, 0),
Rowan = c(0, 0), Alnus = c(0, 0), stem_sum = c(827, 1719)), row.names = c(NA,
-2L), groups = structure(list(.rows = structure(list(1L, 2L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame")), class = c("rowwise_df", "tbl_df", "tbl",
"data.frame"))
What I want to do is use dplyr 1.0 convention to mutate a new column, "Main species", if any of the tree species columns exceed 80% of the stem_sum of that plot.
My thought process:
df %>% rowwise() %>% mutate(`Main species`= c_across(Pine:Alnus, if(.. / stem/sum >= 0.8, paste(...))
How can I modify this code such that if there are more than one column which fulfils the requirement, the output will be "Mixed"?
You can use :
library(dplyr)
df %>%
rowwise() %>%
mutate(Main_Species = if(any(c_across(Pine:Alnus) >= 0.8 * stem_sum))
'Mixed' else 'Not Mixed')
# Plot Pine Spruce Birch Aspen Salix Rowan Alnus stem_sum Main_Species
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#1 1 0 708 119 0 0 0 0 827 Mixed
#2 2 430 1241 48 0 0 0 0 1719 Not Mixed
Or in base R :
df$Main_species <- ifelse(rowSums(df[2:8] >= df$stem_sum * 0.8) > 0,
'Mixed', 'Not Mixed')
library(tidyverse)
df %>%
pivot_longer(-c(Plot, stem_sum)) %>%
arrange(Plot, desc(value)) %>%
group_by(Plot) %>%
mutate(pct = value/stem_sum,
main_species = case_when(
pct > 0.8 & pct == max(pct) ~ name,
pct == max(pct) ~ "mixed"
)
) %>% ungroup() %>%
fill(main_species, .direction = "down") %>%
select(-pct) %>%
pivot_wider()
I have the following situation. Given the table
df <- data.frame(ID = c(1, 2, 2, 3, 3, 4),
type = c("MC", "MC", "MK", "MC", "MK", "MC"),
value1 = c(512, 261, 4523, 1004, 1221, 2556),
value2 = c(726, 4000, 280, 998, 113, 6789))
I am trying to find a way to implement the following logic: If for an ID, both types (MC and MK) occur, use value1 from MK and value2 from MC. Otherwise (only the type MC occurs), use MC.
Hence, the final result is supposed to be:
data.frame(ID = c(1, 2, 3, 4),
type = c("MC", "MC", "MC", "MC"),
value1 = c(512, 4523, 1221, 2556),
value2 = c(726, 4000, 998, 6789))
Assuming the type MK is dropped after extracting the value1.
Another version with dplyr
library(dplyr)
df %>%
group_by(ID) %>%
mutate(value1 = ifelse(any(type == "MK"), value1[type=="MK"],value1[type=="MC"]),
value2 = value2[type == "MC"]) %>%
filter(type == "MC")
# ID type value1 value2
# <dbl> <fct> <dbl> <dbl>
#1 1 MC 512 726
#2 2 MC 4523 4000
#3 3 MC 1221 998
#4 4 MC 2556 6789
Here, for value1 we check value in "MK" if it is present or take corresponding "MC" value instead and for value2 by default we take "MC" value and keep only rows with type "MC". This is assuming every group (ID) would have a "MC" type row.
For efficiency I would definitely prefer #Andre Elrico' answer but here is a dplyr option. Try:
df <- data.frame(ID = c(1, 2, 2, 3, 3, 4),
type = c("MC", "MC", "MK", "MC", "MK", "MC"),
value1 = c(512, 261, 4523, 1004, 1221, 2556),
value2 = c(726, 4000, 280, 998, 113, 6789))
library(dplyr)
df %>%
reshape(., idvar = "ID", timevar = "type", direction = "wide") %>%
group_by(ID) %>%
mutate(value1 = ifelse(is.na(value1.MK), value1.MC, value1.MK),
value2 = ifelse(is.na(value2.MC), value2.MK, value2.MC),
type = "MC") %>%
select(ID, type, value1, value2)
# output
# A tibble: 4 x 4
# Groups: ID [4]
ID type value1 value2
<dbl> <chr> <dbl> <dbl>
1 1 MC 512 726
2 2 MC 4523 4000
3 3 MC 1221 998
4 4 MC 2556 6789
data.table solution
setDT(df1)[,{x=.SD;if(all(c("MC","MK") %in% type)){x$value1[] = last(value1)};first(x)},by=ID]
result:
# ID type value1 value2
#1 1 MC 512 726
#2 2 MC 4523 4000
#3 3 MC 1221 998
#4 4 MC 2556 6789
dplyr:
df1 %>% group_by(ID) %>% do(.,(function(x){if(all(c("MC","MK") %in% x$type)){x$value1[] = x$value1[x$type=="MK"]};x[1,]})(.))
# A tibble: 4 x 4
# Groups: ID [4]
# ID type value1 value2
# <dbl> <fct> <dbl> <dbl>
#1 1 MC 512 726
#2 2 MC 4523 4000
#3 3 MC 1221 998
#4 4 MC 2556 6789