Combining data with Base R - r

I currently need to translate my dplyr code into base R code. My dplyr code gives me 3 columns, competitor sex, the olympic season and the number of different sports. The code looks like this:
olympics %>%
group_by(Sex, Season, Sport) %>%
summarise(n()) %>%
group_by(Sex, Season) %>%
summarise(n()) %>%
setNames(c("Competitor_Sex", "Olympic_Season", "Num_Sports"))
My data structure looks like this.
structure(list(Name = c("A Lamusi", "Juhamatti Tapio Aaltonen",
"Andreea Aanei", "Jamale (Djamel-) Aarrass (Ahrass-)", "Nstor Abad Sanjun",
"Nstor Abad Sanjun"), Sex = c("M", "M", "F", "M", "M", "M"),
Age = c(23L, 28L, 22L, 30L, 23L, 23L), Height = c(170L, 184L,
170L, 187L, 167L, 167L), Weight = c(60, 85, 125, 76, 64,
64), Team = c("China", "Finland", "Romania", "France", "Spain",
"Spain"), NOC = c("CHN", "FIN", "ROU", "FRA", "ESP", "ESP"
), Games = c("2012 Summer", "2014 Winter", "2016 Summer",
"2012 Summer", "2016 Summer", "2016 Summer"), Year = c(2012L,
2014L, 2016L, 2012L, 2016L, 2016L), Season = c("Summer",
"Winter", "Summer", "Summer", "Summer", "Summer"), City = c("London",
"Sochi", "Rio de Janeiro", "London", "Rio de Janeiro", "Rio de Janeiro"
), Sport = c("Judo", "Ice Hockey", "Weightlifting", "Athletics",
"Gymnastics", "Gymnastics"), Event = c("Judo Men's Extra-Lightweight",
"Ice Hockey Men's Ice Hockey", "Weightlifting Women's Super-Heavyweight",
"Athletics Men's 1,500 metres", "Gymnastics Men's Individual All-Around",
"Gymnastics Men's Floor Exercise"), Medal = c(NA, "Bronze",
NA, NA, NA, NA), BMI = c(20.7612456747405, 25.1063327032136,
43.2525951557093, 21.7335354170837, 22.9481157445588, 22.9481157445588
)), .Names = c("Name", "Sex", "Age", "Height", "Weight",
"Team", "NOC", "Games", "Year", "Season", "City", "Sport", "Event",
"Medal", "BMI"), row.names = c(NA, 6L), class = "data.frame")
Does anyone know how to translate this into base R?

Since you are grouping twice in dplyr you can use double aggregate in base R
setNames(aggregate(Name~Sex + Season,
aggregate(Name~Sex + Season + Sport, olympics, length), length),
c("Competitor_Sex", "Olympic_Season", "Num_Sports"))
# Competitor_Sex Olympic_Season Num_Sports
#1 F Summer 1
#2 M Summer 3
#3 M Winter 1
This gives the same output as dplyr option
library(dplyr)
olympics %>%
group_by(Sex, Season, Sport) %>%
summarise(n()) %>%
group_by(Sex, Season) %>%
summarise(n()) %>%
setNames(c("Competitor_Sex", "Olympic_Season", "Num_Sports"))
# Competitor_Sex Olympic_Season Num_Sports
# <chr> <chr> <int>
#1 F Summer 1
#2 M Summer 3
#3 M Winter 1

A base R option would be using aggregate twice
out <- aggregate(BMI ~ Sex + Season,
aggregate(BMI ~ Sex + Season + Sport, olympics, length), length)
names(out) <- c("Competitor_Sex", "Olympic_Season", "Num_Sports")
out
# Competitor_Sex Olympic_Season Num_Sports
#1 F Summer 1
#2 M Summer 3
#3 M Winter 1
It is similar to the OP's output
olympics %>%
group_by(Sex, Season, Sport) %>%
summarise(n()) %>%
group_by(Sex, Season) %>%
summarise(n()) %>%
setNames(c("Competitor_Sex", "Olympic_Season", "Num_Sports"))
# A tibble: 3 x 3
# Groups: Sex [2]
# Competitor_Sex Olympic_Season Num_Sports
# <chr> <chr> <int>
#1 F Summer 1
#2 M Summer 3
#3 M Winter 1
Or it can be done in a compact way with table from base R
table(sub(",[^,]+$", "", names(table(do.call(paste,
c(olympics[c("Sex", "Season", "Sport")], sep=","))))))
# F,Summer M,Summer M,Winter
# 1 3 1

Related

retain only rows and columns that match with a string vector

I have a large DF with certain columns that have a vector of character values as below. The number of columns varies from dataset to dataset as well as the number of character vectors it holds also varies.
ID Country1 Country2 Country3
1 1 Argentina, Japan,USA,Poland, Argentina,USA Pakistan
2 2 Colombia, Mexico,Uruguay,Dutch Mexico,Uruguay Afganisthan
3 3 Argentina, Japan,USA,NA Japan Khazagistan
4 4 Colombia, Mexico,Uruguay,Dutch Colombia, Dutch North Korea
5 5 India, China China Iran
Would like to match them one-to-one with another string vector as below
vals_to_find <-c("Argentina","USA","Mexico")
If, a column/row matches to anyone of the strings passed would like to retain that column and row. Remove duplicates, and finally remove those values that do not match.
the desired output is as follows
ID Countries.found
1 1 Argentina, USA
2 2 Mexico
3 3 Argentina, USA
4 4 Mexico
data
dput(df)
structure(list(ID = 1:5, Country1 = c("Argentina, Japan,USA,Poland,",
"Colombia, Mexico,Uruguay,Dutch", "Argentina, Japan,USA,NA",
"Colombia, Mexico,Uruguay,Dutch", "India, China"), Country2 = c("Argentina,USA",
"Mexico,Uruguay", "Japan", "Colombia, Dutch", "China"), Country3 = c("Pakistan",
"Afganisthan", "Khazagistan", "North Korea", "Iran")), class = "data.frame", row.names = c(NA,
-5L))
dput(df_out)
structure(list(ID = 1:4, Countries.found = c("Argentina, USA",
"Mexico", "Argentina, USA", "Mexico")), class = "data.frame", row.names = c(NA,
-4L))
Instead of a each column as a vector, if the file is read as one value per column. Then, was able do it as below
dput(df_out)
structure(list(ID = 1:5, X1 = c("Argentina", "Colombia", "Argentina",
"Colombia", "India"), X2 = c("Japan", "Mexico", "Japan", "Mexico",
"China"), X3 = c("USA", "Uruguay", "USA", "Uruguay", NA), X4 = c("Poland",
"Dutch", NA, "Dutch", NA), X5 = c("Argentina", "Mexico", "Japan",
"Colombia", "China"), X6 = c("USA", "Uruguay", NA, "Dutch", NA
), X7 = c("Pakistan", "Afganisthan", "Khazagistan", "North Korea",
"Iran")), class = "data.frame", row.names = c(NA, -5L))
df_out %>%
dplyr::select(
where(~ !all(is.na(.x)))
) %>%
dplyr::select(c(1, where(~ any(.x %in% vals_to_find)))) %>%
dplyr::mutate(dplyr::across(
tidyselect::starts_with("X"),
~ vals_to_find[match(., vals_to_find)]
)) %>%
tidyr::unite("countries_found", tidyselect::starts_with("X"),
sep = " | ", remove = TRUE, na.rm = TRUE
)
Output
ID countries_found
1 1 Argentina | USA | Argentina | USA
2 2 Mexico | Mexico
3 3 Argentina | USA
4 4 Mexico
unite the "Country" columns, then create a long vector by separating the values into rows, get all distinct values per ID, filter only those who are in vals_to_find, and summarise each countries.found toString.
library(tidyr)
library(dplyr)
df %>%
unite("Country", starts_with("Country"), sep = ",") %>%
separate_rows(Country) %>%
distinct(ID, Country) %>%
filter(Country %in% vals_to_find) %>%
group_by(ID) %>%
summarise(Countries.found = toString(Country))
output
# A tibble: 4 × 2
ID Countries.found
<int> <chr>
1 1 Argentina, USA
2 2 Mexico
3 3 Argentina, USA
4 4 Mexico
We may use
library(dplyr)
library(tidyr)
library(stringr)
df %>%
mutate(across(starts_with("Country"),
~ str_extract_all(.x, str_c(vals_to_find, collapse = "|")))) %>%
pivot_longer(cols = -ID, names_to = NULL,
values_to = 'Countries.found') %>%
unnest(Countries.found) %>%
distinct %>%
group_by(ID) %>%
summarise(Countries.found = toString(Countries.found))
-output
# A tibble: 4 × 2
ID Countries.found
<int> <chr>
1 1 Argentina, USA
2 2 Mexico
3 3 Argentina, USA
4 4 Mexico

How do I transpose observations into variables/columns?

I've got a data frame that looks something like this:
precinct, race, age, people
1001, black, 18-40, 1
1001, white, 18-40, 2
1001, hispanic, 18-40, 3
1001, asian, 18-40, 4
1001, black, 40 or older, 5
1001, white, 40 or older, 6
1001, hispanic, 40 or older, 7
1001, asian, 40 or older, 8
I want to make it look like this:
precinct, black, white, hispanic, asian, 18-40, 40 or older
1001, 6, 8, 10, 12, 10, 26
I've used dcast
dcast(
data = mydataframe,
formula = Precinct ~ race + age,
fun.aggregate = sum,
value.var = 'people'
)
but this does not produce my desired result.
When we create formula with + on the rhs of ~ it creates the combinations between those columns instead of having every single unique element from those columns. In order to have the latter, we may need to melt to long format and then use dcast on the single column (assuming those columns are of the same type)
library(data.table)
dcast(melt(setDT(mydataframe), id.var = c('precinct', 'people')),
precinct ~ value, fun.aggregate = sum, value.var = 'people')
-output
Key: <precinct>
precinct 18-40 40 or older asian black hispanic white
<int> <int> <int> <int> <int> <int> <int>
1: 1001 10 26 12 6 10 8
library(dplyr)
library(tidyr)
mydataframe %>%
pivot_longer(cols = c(race, age), names_to = NULL) %>%
pivot_wider(names_from = value, values_from = people, values_fn = sum)
-output
# A tibble: 1 × 7
precinct black `18-40` white hispanic asian `40 or older`
<int> <int> <int> <int> <int> <int> <int>
1 1001 6 10 8 10 12 26
data
mydataframe <- structure(list(precinct = c(1001L, 1001L, 1001L, 1001L, 1001L,
1001L, 1001L, 1001L), race = c("black", "white", "hispanic",
"asian", "black", "white", "hispanic", "asian"), age = c("18-40",
"18-40", "18-40", "18-40", "40 or older", "40 or older", "40 or older",
"40 or older"), people = 1:8), row.names = c(NA, -8L),
class = "data.frame")

How to create a new data frame with grouped transactions in R?

I am trying to create a new data frame in R using an existing data frame of items bought in transactions as shown below:
dput output for the data:
structure(list(Transaction = c(1L, 2L, 2L, 3L, 3L, 3L), Item = c("Bread",
"Scandinavian", "Scandinavian", "Hot chocolate", "Jam", "Cookies"
), date_time = c("30/10/2016 09:58", "30/10/2016 10:05", "30/10/2016 10:05",
"30/10/2016 10:07", "30/10/2016 10:07", "30/10/2016 10:07"),
period_day = c("morning", "morning", "morning", "morning",
"morning", "morning"), weekday_weekend = c("weekend", "weekend",
"weekend", "weekend", "weekend", "weekend"), Year = c("2016",
"2016", "2016", "2016", "2016", "2016"), Month = c("October",
"October", "October", "October", "October", "October")), row.names = c(NA,
6L), class = "data.frame")
As you can see in the example, the rows are due to each individual product bought, not the transactions themselves (hence why Transaction 2 is both rows 2 and 3).
I would like to make a new table where the rows are the different transactions (1, 2, 3, etc.) and the different columns are categorical (Bread = 0, 1) so I can perform apriori analysis.
Any idea how I can group the different transactions together and then create these new columns?
Assuming your dataframe is called df you can use tidyr's pivot_wider :
df1 <- tidyr::pivot_wider(df, names_from = Item, values_from = Item,
values_fn = n_distinct, values_fill = 0)
df1
# Transaction date_time period_day weekday_weekend Year Month Bread Scandinavian `Hot chocolate` Jam Cookies
# <int> <chr> <chr> <chr> <chr> <chr> <int> <int> <int> <int> <int>
#1 1 30/10/2016 09… morning weekend 2016 Octob… 1 0 0 0 0
#2 2 30/10/2016 10… morning weekend 2016 Octob… 0 1 0 0 0
#3 3 30/10/2016 10… morning weekend 2016 Octob… 0 0 1 1 1
Or with data.table's dcast :
library(data.table)
dcast(setDT(df), Transaction+date_time+period_day + weekday_weekend +
Year + Month ~ Item, value.var = 'Item', fun.aggregate = uniqueN)
Try dummy_cols from the fastDummies package. This will turn the item column into 0's and 1's. The second line sums per transaction.
d <- dummy_cols(data[1:2], remove_selected_column=T)
d <- aggregate(d[-1], by=list(Transaction=d$Transaction), FUN=sum)

How to include select 2-word phrases as tokens in tidytext?

I'm preprocessing some text data for further analysis. I tokenized the text using unnest_tokens() [into singular words] but want to keep certain commonly-occuring 2 word phrases such as "United States" or "social security." How can I do this using tidytext?
tidy_data <- data %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
dput(data[1:6, 1:6])
structure(list(race = c("US House", "US House", "US House", "US House",
"", "US House"), district = c(8L, 3L, 6L, 17L, 2L, 1L), party = c("Republican",
"Republican", "Republican", "Republican", "", "Republican"),
state = c("AZ", "AZ", "KY", "TX", "IL", "NH"), sponsor = c(4,
4, 4, 1, NA, 4), approve = structure(c(1L, 1L, 1L, 4L, NA,
1L), .Label = c("no oral statement of approval, authorization",
"beginning of the spot", "middle of the spot", "end of the spot"
), class = "factor")), row.names = c(NA, 6L), class = "data.frame")
If I were in this situation and I only had a short list of two-word phrases I need to keep in my analysis, I would do some prudent replacing before and after tokenization.
First, I would replace the two-word phrases with something that will stick together and not get broken apart by the tokenization process I'm using, like perhaps "united states" to "united_states".
library(tidyverse)
library(tidytext)
df <- tibble(text = c("I live in the United States",
"United we stand, divided we fall",
"Information security is important!",
"I work at the Social Security Administration"))
df_parsed <- df %>%
mutate(text = str_to_lower(text),
text = str_replace_all(text, "united states", "united_states"),
text = str_replace_all(text, "social security", "social_security"))
df_parsed
#> # A tibble: 4 x 1
#> text
#> <chr>
#> 1 i live in the united_states
#> 2 united we stand, divided we fall
#> 3 information security is important!
#> 4 i work at the social_security administration
Then you can tokenize like normal, and afterward, replace the things you just made with the two-word phrases again, so "united_states" back to "united states".
df_parsed %>%
unnest_tokens(word, text) %>%
mutate(word = case_when(word == "united_states" ~ "united states",
word == "social_security" ~ "social security",
TRUE ~ word))
#> # A tibble: 21 x 1
#> word
#> <chr>
#> 1 i
#> 2 live
#> 3 in
#> 4 the
#> 5 united states
#> 6 united
#> 7 we
#> 8 stand
#> 9 divided
#> 10 we
#> # … with 11 more rows
Created on 2019-08-03 by the reprex package (v0.3.0)
If you have a long list of these, it's going to get difficult and onerous, and then it might make sense to look at ways to use bigram and unigram tokenization. You can see an example of that here.

Addition of specific rows in a dataframe

I'm trying to add specific row of a data frame together.
And short of using grepl to find lines and then rbinding them to the bottom, I'm not sure if there's a better way to do this.
this is my input df:
input = structure(list(
V1 = c("Sales", "Sales", "Sales", "Sales", "Sales","Sales"),
V2 = c("Johnny", "Meg", "Fred", "Johnny", "Meg", "Fred"),
V3 = c("Australia", "Australia", "Australia", "NZ", "NZ","NZ"),
V4 = c(154L, 1898L, 175L, 1235L, 23L, 255L)), row.names = c(NA,6L),
class = "data.frame")
and this is my expected output:
structure(list(
V1 = c("Sales", "Sales", "Sales", "Sales", "Sales",
"Sales", "Sales", "Sales", "Sales", "Sales", "Sales", "Sales"),
V2 = c("Johnny", "Meg", "Fred", "Johnny", "Meg", "Fred", "Johnny + Fred",
"Meg + Fred", "Johnny + Meg + Fred", "Johnny + Fred", "Meg + Fred",
"Johnny + Meg + Fred"),
V3 = c("Australia", "Australia", "Australia", "NZ",
"NZ", "NZ", "Australia", "Australia", "Australia", "NZ", "NZ", "NZ"),
V4 = c(154L, 1898L, 175L, 1235L, 23L, 255L, 329L, 2073L, 2227L, 1490L, 278L, 1513L)),
class = "data.frame", row.names = c(NA, -12L)
)
I would've thought there's a better way to there's a better way of adding these rows that filtering and then adding, and then joining etc.
Can anyone point me in the right direction of what I should be looking for?
I solve the problem using combn
Data input part
input = structure(list(
V1 = c("Sales", "Sales", "Sales", "Sales", "Sales","Sales"),
V2 = c("Johnny", "Meg", "Fred", "Johnny", "Meg", "Fred"),
V3 = c("Australia", "Australia", "Australia", "NZ", "NZ","NZ"),
V4 = c(154L, 1898L, 175L, 1235L, 23L, 255L)), row.names = c(NA,6L),
class = "data.frame")
structure(list(
V1 = c("Sales", "Sales", "Sales", "Sales", "Sales",
"Sales", "Sales", "Sales", "Sales", "Sales", "Sales", "Sales"),
V2 = c("Johnny", "Meg", "Fred", "Johnny", "Meg", "Fred", "Johnny + Fred",
"Meg + Fred", "Johnny + Meg + Fred", "Johnny + Fred", "Meg + Fred",
"Johnny + Meg + Fred"),
V3 = c("Australia", "Australia", "Australia", "NZ",
"NZ", "NZ", "Australia", "Australia", "Australia", "NZ", "NZ", "NZ"),
V4 = c(154L, 1898L, 175L, 1235L, 23L, 255L, 329L, 2073L, 2227L, 1490L, 278L, 1513L)),
class = "data.frame", row.names = c(NA, -12L)
)
Solution
library(dplyr)
TT = unique(input$V2)
> TT
[1] "Johnny" "Meg" "Fred"
comb2 = combn(TT,2,simplify = FALSE)
> comb2
[[1]]
[1] "Johnny" "Meg"
[[2]]
[1] "Johnny" "Fred"
[[3]]
[1] "Meg" "Fred"
comb3 = combn(TT,3,simplify = FALSE)
> comb3
[[1]]
[1] "Johnny" "Meg" "Fred"
result = function(data){
purrr::map_df(lapply(data,function(x){paste(x,collapse = '|')}), function(x){
df = input[grepl(x,input$V2),] %>% group_by(V3)%>%summarize(V1= 'Sales',
V2= paste(V2,collapse = '+'),
V4= sum(V4))
return(df)
}
)
}
Result
result(comb2)
# A tibble: 6 x 4
V3 V1 V2 V4
<chr> <chr> <chr> <int>
1 Australia Sales Johnny+Meg 2052
2 NZ Sales Johnny+Meg 1258
3 Australia Sales Johnny+Fred 329
4 NZ Sales Johnny+Fred 1490
5 Australia Sales Meg+Fred 2073
6 NZ Sales Meg+Fred 278
result(comb3)
# A tibble: 2 x 4
V3 V1 V2 V4
<chr> <chr> <chr> <int>
1 Australia Sales Johnny+Meg+Fred 2227
2 NZ Sales Johnny+Meg+Fred 1513
finalResult = bind_rows(A,B,input) %>%
select(V1,V2,V3,V4) %>% filter(! V2 %in% c('Johnny+Meg'))
> finalResult
# A tibble: 12 x 4
V1 V2 V3 V4
<chr> <chr> <chr> <int>
1 Sales Johnny+Fred Australia 329
2 Sales Johnny+Fred NZ 1490
3 Sales Meg+Fred Australia 2073
4 Sales Meg+Fred NZ 278
5 Sales Johnny+Meg+Fred Australia 2227
6 Sales Johnny+Meg+Fred NZ 1513
7 Sales Johnny Australia 154
8 Sales Meg Australia 1898
9 Sales Fred Australia 175
10 Sales Johnny NZ 1235
11 Sales Meg NZ 23
12 Sales Fred NZ 255
Using tidyverse we can first split the dataframe based on V3 then create combination of names and add sum to create a new tibble and bind it to the original dataframe.
library(tidyverse)
input %>%
bind_rows(input %>%
group_split(V3) %>%
map_dfr(function(x) map_dfr(2:nrow(x), ~tibble(
V1 = first(x$V1),
V2 = combn(x$V2, ., paste, collapse = " + "),
V3 = first(x$V3),
V4 = combn(x$V4, .,sum)) %>%
filter(grepl("\\bFred\\b", V2)))))
# V1 V2 V3 V4
#1 Sales Johnny Australia 154
#2 Sales Meg Australia 1898
#3 Sales Fred Australia 175
#4 Sales Johnny NZ 1235
#5 Sales Meg NZ 23
#6 Sales Fred NZ 255
#7 Sales Johnny + Fred Australia 329
#8 Sales Meg + Fred Australia 2073
#9 Sales Johnny + Meg + Fred Australia 2227
#10 Sales Johnny + Fred NZ 1490
#11 Sales Meg + Fred NZ 278
#12 Sales Johnny + Meg + Fred NZ 1513
Using the same logic but in base R, we can do
rbind(input, do.call(rbind, lapply(split(input, input$V3), function(x)
do.call(rbind, lapply(2:nrow(x), function(y)
subset(data.frame(V1 = x$V1[1],
V2 = combn(x$V2, y, paste, collapse = " + "),
V3 = x$V3[1],
V4 = combn(x$V4, y, sum)),
grepl("\\bFred\\b", V2)))))))

Resources