cumulative count of character vector - r

I want to make a cumulative count of country names from a data frame:
df <- data.frame(country = c("Sweden", "Germany", "Sweden", "Sweden", "Germany",
"Vietnam"), year= c(1834, 1846, 1847, 1852, 1860, 1865))
I have tried different versions of count(), cumsum() and tally() but can’t seem to get it right.
Output should look like:
country year n
Sweden 1834 1
Germany 1846 2
Sweden 1847 2
Sweden 1852 2
Germany 1860 2
Vietnam 1865 3

df %>% mutate(count = cumsum(!duplicated(.$country))) %>% as_tibble()
#> # A tibble: 6 x 3
#> country year count
#> <fctr> <dbl> <int>
#> 1 Sweden 1834 1
#> 2 Germany 1846 2
#> 3 Sweden 1847 2
#> 4 Sweden 1852 2
#> 5 Germany 1860 2
#> 6 Vietnam 1865 3
or
# Running count of distinct values: element i is the number of unique
# values among var[1:i].
#
# A value raises the count only on its first appearance, so a cumulative
# sum over the "not seen before" flags gives the answer in a single pass
# (instead of re-scanning every prefix) and returns integer(0) for an
# empty input.
dist_cum <- function(var) {
  cumsum(!duplicated(var))
}
df %>% mutate(var2=dist_cum(country))
#> country year var2
#> 1 Sweden 1834 1
#> 2 Germany 1846 2
#> 3 Sweden 1847 2
#> 4 Sweden 1852 2
#> 5 Germany 1860 2
#> 6 Vietnam 1865 3

You can try this:
library(ggplot2)
library(plyr)
df<-data.frame(country=c("Sweden","Germany","Sweden","Sweden","Germany","Vietnam", "Germany"),year= c(1834,1846,1847,1852,1860,1865,1860))
counts <- ddply(df, .(df$country, df$year), nrow)
The output is:
> counts
df$country df$year V1
1 Germany 1846 1
2 Germany 1860 2
3 Sweden 1834 1
4 Sweden 1847 1
5 Sweden 1852 1
6 Vietnam 1865 1

Related

How to add a ranking column for this dataset?

My data is as follows:
df <- data.frame(
comp_name = c("A","B","C","D","E","F","G","H","J","K","L","M"),
country = c("US", "UK", "France", "Germany", "US", "UK", "France", "Germany", "US", "UK", "France", "Germany"),
profit = c(100,125,150,165,150,110,110,125,130,250,95,100)
)
df:
comp_name country profit
1 A US 100
2 B UK 125
3 C France 150
4 D Germany 165
5 E US 150
6 F UK 110
7 G France 110
8 H Germany 125
9 J US 130
10 K UK 250
11 L France 95
12 M Germany 100
I would like to add a rank column to this data frame which ranks companies by profit by country, like this:
comp_name country profit rank
1 A US 100 3
2 B UK 125 2
3 C France 150 1
4 D Germany 165 1
5 E US 150 1
6 F UK 110 3
7 G France 110 2
8 H Germany 125 2
9 J US 130 2
10 K UK 250 1
11 L France 95 3
12 M Germany 100 3
I'm relatively new to R and don't know where to start with this. Any help would be greatly appreciated. Thanks!
Does this work:
library(dplyr)
df %>% group_by(country) %>% mutate(rank = rank(desc(profit)))
# A tibble: 12 x 4
# Groups: country [4]
comp_name country profit rank
<chr> <chr> <dbl> <dbl>
1 A US 100 3
2 B UK 125 2
3 C France 150 1
4 D Germany 165 1
5 E US 150 1
6 F UK 110 3
7 G France 110 2
8 H Germany 125 2
9 J US 130 2
10 K UK 250 1
11 L France 95 3
12 M Germany 100 3
An option with data.table
library(data.table)
setDT(df)[, Rank := frank(-profit), country]
A base R option using rank + ave
transform(
df,
Rank = ave(-profit, country, FUN = rank)
)
gives
comp_name country profit Rank
1 A US 100 3
2 B UK 125 2
3 C France 150 1
4 D Germany 165 1
5 E US 150 1
6 F UK 110 3
7 G France 110 2
8 H Germany 125 2
9 J US 130 2
10 K UK 250 1
11 L France 95 3
12 M Germany 100 3
df %>%
dplyr::group_by(country) %>%
dplyr::group_map(function(x, y){
x %>% dplyr::mutate(rank = rank(-profit))
}) %>%
dplyr::bind_rows()
Karthik S provided a cleaner answer.
Apparently, group_map here is redundant

How to add new multiple rows to data.frame on R?

Below is how my code and dataframe looks like.
#Get country counts
countries <- as.data.frame(table(na.omit(co_df$country)))
print(countries)
Var1 Freq
1 Austria 6
2 Canada 4
3 France 1
4 Germany 23
5 India 17
6 Italy 1
7 Russia 2
8 Sweden 1
9 UK 2
10 USA 10
I would like to add 4 new rows to the above countries data frame such that it looks like the below:
Var1 Freq
1 Austria 6
2 Canada 4
3 France 1
4 Germany 23
5 India 17
6 Italy 1
7 Russia 2
8 Sweden 1
9 UK 2
10 USA 10
11 Uruguay 25
12 Saudi Arabia 19
13 Japan 11
14 Australia 10
I performed the below rbind function but it gave me an error; I also tried merge(countries, Addcountries, by = Null) and the as.data.frame function but these too gave me errors.
Addcountries <- data.frame(c(11, 12, 13, 14), c("Uruguay", "Saudi Arabia", "Japan", "Australia"), c("25", "19", "11", "10"))
names(Addcountries) <- c("Var1", "Freq")
countries2 <- rbind(countries, Addcountries)
print(countries2)
This is likely a silly issue but I would appreciate any help here since I'm new to R.
you may also use dplyr::add_row()
countries %>% add_row(Var1 = c("Uruguay", "Saudi Arabia", "Japan", "Australia"),
Freq = c(25, 19, 11, 10))
check it
# Rebuild the `countries` table from inline text so the example is
# reproducible. `header = TRUE` is spelled out because `T` is an ordinary
# variable that can be reassigned.
countries <- read.table(text = " Var1 Freq
Austria 6
Canada 4
France 1
Germany 23
India 17
Italy 1
Russia 2
Sweden 1
UK 2
USA 10", header = TRUE)
countries %>% add_row(Var1 = c("Uruguay", "Saudi Arabia", "Japan", "Australia"),
Freq = c(25, 19, 11, 10))
Var1 Freq
1 Austria 6
2 Canada 4
3 France 1
4 Germany 23
5 India 17
6 Italy 1
7 Russia 2
8 Sweden 1
9 UK 2
10 USA 10
11 Uruguay 25
12 Saudi Arabia 19
13 Japan 11
14 Australia 10
Create a dataframe with two columns and rbind.
Addcountries <- data.frame(Var1 = c("Uruguay", "Saudi Arabia", "Japan", "Australia"),
Freq = c(25, 19, 11, 10), stringsAsFactors = FALSE)
countries2 <- rbind(countries, Addcountries)

Drop lines in a long dataset by group, based on some condition

I have this df:
library(lubridate)
Date <- c("2020-10-01", "2020-10-02", "2020-10-03", "2020-10-04",
"2020-10-01", "2020-10-02", "2020-10-03", "2020-10-04",
"2020-10-01", "2020-10-02", "2020-10-03", "2020-10-04")
Date <- as_date(Date)
Country <- c("USA", "USA", "USA", "USA",
"Mexico", "Mexico", "Mexico", "Mexico",
"Japan", "Japan", "Japan","Japan")
Value_A <- c(0,40,0,0,25,29,34,0,20,25,27,0)
df<- data.frame(Date, Country, Value_A)
view(df)
Date Country Value_A
<date> <chr> <dbl>
1 2020-10-01 USA 0
2 2020-10-02 USA 40
3 2020-10-03 USA 0
4 2020-10-04 USA 0
5 2020-10-01 Mexico 25
6 2020-10-02 Mexico 29
7 2020-10-03 Mexico 34
8 2020-10-04 Mexico 0
9 2020-10-01 Japan 20
10 2020-10-02 Japan 25
11 2020-10-03 Japan 27
12 2020-10-04 Japan 0
I'm trying to drop the rows containing zeros, but only if these zeros are in the last two rows of each group of the Country column. So the result would be:
Date Country Value_A
<date> <chr> <dbl>
1 2020-10-01 USA 0
2 2020-10-02 USA 40
5 2020-10-01 Mexico 25
6 2020-10-02 Mexico 29
7 2020-10-03 Mexico 34
9 2020-10-01 Japan 20
10 2020-10-02 Japan 25
11 2020-10-03 Japan 27
I would appreciate it if someone could help :)
We can use the tidyverse package to do a few manipulations to get the result. We group_by Country, and sort descending by Date. After that, we generate row_numbers. Finally, we filter based on the condition you described:
library(tidyverse)
df %>%
group_by(Country) %>%
arrange(desc(Date)) %>%
mutate(rn = row_number()) %>%
filter(!(Value_A == 0 & rn <= 2))
# Date Country Value_A rn
# 1 2020-10-03 Mexico 34 2
# 2 2020-10-03 Japan 27 2
# 3 2020-10-02 USA 40 3
# 4 2020-10-02 Mexico 29 3
# 5 2020-10-02 Japan 25 3
# 6 2020-10-01 USA 0 4
# 7 2020-10-01 Mexico 25 4
# 8 2020-10-01 Japan 20 4
Another method would be to use rank(desc(Date))
library(tidyverse)
df %>%
group_by(Country) %>%
mutate(rank_date = rank(desc(Date))) %>%
filter(!(rank_date <= 2 & Value_A == 0))
# Date Country Value_A rank_date
# 1 2020-10-01 USA 0 4
# 2 2020-10-02 USA 40 3
# 3 2020-10-01 Mexico 25 4
# 4 2020-10-02 Mexico 29 3
# 5 2020-10-03 Mexico 34 2
# 6 2020-10-01 Japan 20 4
# 7 2020-10-02 Japan 25 3
# 8 2020-10-03 Japan 27 2

R - calculate annual population conditional on survival in every year

I have a data frame with three columns: birth_year, death_year, gender.
I have to calculate total alive male and female population for every year in a given range (1950:1980).
The data frame looks like this:
birth_year death_year gender
1934 1988 male
1922 1993 female
1890 1966 male
1901 1956 male
1946 2009 female
1909 1976 female
1899 1945 male
1887 1949 male
1902 1984 female
The person is alive in year x if death_year > x & birth year <= x
The output I am looking for is something like this:
year male female
1950 3 4
1951 2 3
1952 4 3
1953 4 5
.
.
1980 6 3
Thanks!
Does this work:
library(tidyr)
library(purrr)
library(dplyr)
df %>% mutate(year = map2(1950,1980, seq)) %>% unnest(year) %>%
mutate(isalive = case_when(year >= birth_year & year < death_year ~ 1, TRUE ~ 0)) %>%
group_by(year, gender) %>% summarise(alive = sum(isalive)) %>%
pivot_wider(names_from = gender, values_from = alive) %>% print( n = 50)
`summarise()` regrouping output by 'year' (override with `.groups` argument)
# A tibble: 31 x 3
# Groups: year [31]
year female male
<int> <dbl> <dbl>
1 1950 4 3
2 1951 4 3
3 1952 4 3
4 1953 4 3
5 1954 4 3
6 1955 4 3
7 1956 4 2
8 1957 4 2
9 1958 4 2
10 1959 4 2
11 1960 4 2
12 1961 4 2
13 1962 4 2
14 1963 4 2
15 1964 4 2
16 1965 4 2
17 1966 4 1
18 1967 4 1
19 1968 4 1
20 1969 4 1
21 1970 4 1
22 1971 4 1
23 1972 4 1
24 1973 4 1
25 1974 4 1
26 1975 4 1
27 1976 3 1
28 1977 3 1
29 1978 3 1
30 1979 3 1
31 1980 3 1
Data used:
df
# A tibble: 9 x 3
birth_year death_year gender
<dbl> <dbl> <chr>
1 1934 1988 male
2 1922 1993 female
3 1890 1966 male
4 1901 1956 male
5 1946 2009 female
6 1909 1976 female
7 1899 1945 male
8 1887 1949 male
9 1902 1984 female
Here's a simple base R solution. Summing a logical vector will get you your count of alive or dead because TRUE is 1 and FALSE is 0.
# Count how many people in `df` are alive in each year of `range`.
#
# range: vector of years to evaluate.
# df:    data frame with columns `birth_year` and `death_year`.
#
# A person is alive in year x when birth_year <= x and death_year > x.
# Summing the logical mask counts the TRUE entries. vapply() pins the
# result to one integer per year, so an empty `range` yields integer(0)
# rather than an empty list.
number_alive <- function(range, df) {
  vapply(
    range,
    function(x) sum((df$death_year > x) & (df$birth_year <= x)),
    integer(1)
  )
}
# Build the per-year table, counting each gender separately.
# The trailing comma in `df[cond, ]` selects ROWS; without it, `df[cond]`
# is treated as column selection and fails with "undefined columns selected".
output <- data.frame('year' = 1950:1980,
                     'female' = number_alive(1950:1980, df[df$gender == 'female', ]),
                     'male' = number_alive(1950:1980, df[df$gender == 'male', ]))
# year female male
# 1 1950 4 3
# 2 1951 4 3
# 3 1952 4 3
# 4 1953 4 3
# 5 1954 4 3
# 6 1955 4 3
# 7 1956 4 2
# 8 1957 4 2
# 9 1958 4 2
# 10 1959 4 2
# 11 1960 4 2
# 12 1961 4 2
# 13 1962 4 2
# 14 1963 4 2
# 15 1964 4 2
# 16 1965 4 2
# 17 1966 4 1
# 18 1967 4 1
# 19 1968 4 1
# 20 1969 4 1
# 21 1970 4 1
# 22 1971 4 1
# 23 1972 4 1
# 24 1973 4 1
# 25 1974 4 1
# 26 1975 4 1
# 27 1976 3 1
# 28 1977 3 1
# 29 1978 3 1
# 30 1979 3 1
# 31 1980 3 1
This approach uses an ifelse to determine if alive (1) or dead (0).
Data:
df <- "birth_year death_year gender
1934 1988 male
1922 1993 female
1890 1966 male
1901 1956 male
1946 2009 female
1909 1976 female
1899 1945 male
1887 1949 male
1902 1984 female"
df <- read.table(text = df, header = TRUE)
Code:
library(dplyr)
library(tidyr)
library(tibble)
library(purrr)
df %>%
  # Repeat every person once for each year in 1950:1980.
  mutate(year = map2(1950, 1980, seq)) %>%
  unnest(year) %>%
  select(year, birth_year, death_year, gender) %>%
  mutate(
    # Alive in `year` iff born on or before it and died strictly after it
    # (death_year > year, as defined in the question), so a person is not
    # counted in the year of their death.
    alive = ifelse(year >= birth_year & year < death_year, 1, 0)
  ) %>%
  group_by(year, gender) %>%
  summarise(
    is_alive = sum(alive)
  ) %>%
  # One column per gender, one row per year.
  pivot_wider(
    names_from = gender,
    values_from = is_alive
  ) %>%
  select(year, male, female)
Output:
#> # A tibble: 31 x 3
#> # Groups: year [31]
#> year male female
#> <int> <dbl> <dbl>
#> 1 1950 3 4
#> 2 1951 3 4
#> 3 1952 3 4
#> 4 1953 3 4
#> 5 1954 3 4
#> 6 1955 3 4
#> 7 1956 2 4
#> 8 1957 2 4
#> 9 1958 2 4
#> 10 1959 2 4
#> # … with 21 more rows
Created on 2020-11-11 by the reprex package (v0.3.0)

grouping data in R and summing by decade

I have the following dataset:
ireland england france year
5 3 2 1920
4 3 4 1921
6 2 1 1922
3 1 5 1930
2 5 2 1931
I need to summarise the data by 1920's and 1930's. So I need total points for ireland, england and france in the 1920-1922 and then another total point for ireland,england and france in 1930,1931.
Any ideas? I have tried but failed.
Dataset:
# Inline copy of the dataset. The header must name all four columns:
# if it has one field fewer than the data rows, read.table() uses the
# first column as row names and no `year` column is created.
x <- read.table(text = "ireland england france year
5 3 2 1920
4 3 4 1921
6 2 1 1922
3 1 5 1930
2 5 2 1931", header = TRUE)
How about dividing the years by 10 and then summarizing?
library(dplyr)
x %>% mutate(decade = floor(year/10)*10) %>%
group_by(decade) %>%
summarize_all(sum) %>%
select(-year)
# A tibble: 2 x 5
# decade ireland england france
# <dbl> <int> <int> <int>
# 1 1920 15 8 7
# 2 1930 5 6 7
An R base solution
As A5C1D2H2I1M1N2O1R2T1 mentioned, you can use findInterval() to assign the corresponding decade to each year, and then aggregate() to group by decade.
# Inline copy of the dataset.
txt <-
"ireland england france year
5 3 2 1920
4 3 4 1921
6 2 1 1922
3 1 5 1930
2 5 2 1931"
df <- read.table(text = txt, header = TRUE)
# Decade break points derived from the data, so any range of years is
# covered: from the decade of the earliest year up to one decade past the
# latest year.
decades <- seq(
  floor(min(df$year, na.rm = TRUE) / 10) * 10,
  floor(max(df$year, na.rm = TRUE) / 10) * 10 + 10,
  by = 10
)
# findInterval() returns, for each year, the index of the break at or
# below it; indexing `decades` with that gives the decade's starting year.
df$decade <- decades[findInterval(df$year, decades)]
# Sum each country's points within every decade.
aggregate(cbind(ireland, england, france) ~ decade, data = df, sum)
Output:
decade ireland england france
1 1920 15 8 7
2 1930 5 6 7

Resources