How to replace this code in r by a function? - r

I have this code that works but I would like to learn how to replace it by a function.
library(tidyverse)
l1_1617 <- read.csv("http://www.football-data.co.uk/mmz4281/1617/F1.csv", stringsAsFactors = FALSE)
l1_1516 <- read.csv("http://www.football-data.co.uk/mmz4281/1516/F1.csv", stringsAsFactors = FALSE)
l1_1415 <- read.csv("http://www.football-data.co.uk/mmz4281/1415/F1.csv", stringsAsFactors = FALSE)
l1_1314 <- read.csv("http://www.football-data.co.uk/mmz4281/1314/F1.csv", stringsAsFactors = FALSE)
l1_1617_sel <- l1_1617 %>%
select(Date:AST) %>%
mutate(season = 1617)
l1_1516_sel <- l1_1516 %>%
select(Date:AST) %>%
mutate(season = 1516)
l1_1415_sel <- l1_1415 %>%
select(Date:AST) %>%
mutate(season = 1415)
l1_1314_sel <- l1_1314 %>%
select(Date:AST) %>%
mutate(season = 1314)
l1_1317 <- bind_rows(l1_1617_sel, l1_1516_sel, l1_1415_sel, l1_1314_sel)
For the first step I have tried something like this but it obviously failed:
dl_l1 <-function(x){
df_x <- read.csv("http://www.football-data.co.uk/mmz4281/x/F1.csv", stringsAsFactors = FALSE)
}
dl_l1(1617)

You need to use paste to concatenate to build the url. Below code should work.
dl_l1 <-function(x){
read.csv(paste0("http://www.football-data.co.uk/mmz4281/",x,"/F1.csv"), stringsAsFactors = FALSE) %>%
select(Date:AST) %>%
mutate(season = x)
}
dl_l1(1617)
#final output
l1_1317 <- bind_rows(dl_l1(1617), dl_l1(1516), dl_l1(1415), dl_l1(1314))

library(tidyverse)
ids <- as.character(c(1617, 1516, 1415, 1314))
data <- lapply(ids, function(i) {
read.csv(paste0("http://www.football-data.co.uk/mmz4281/", i ,"/F1.csv"), stringsAsFactors = FALSE) %>%
select(Date:AST) %>%
mutate(season = i)
})
data <- do.call(rbind, data)

I would create a for loop in a function so you can iterate through a vector of numbers:
create function football that takes a number, or a vector of numbers, then create an empty data.frame. for each number in the vector, you want to paste it into the url, and then mutate the year into that df. Then you bind_rows into the df. At the end you return the football_df which is the bind_rows version of all of the ones combined.
library(dplyr)
football <- function(numbers){
football_df <- data.frame()
for (i in seq_along(numbers)){
df <- read.csv(paste("http://www.football-data.co.uk/mmz4281/",numbers[i],"/F1.csv", sep=""), stringsAsFactors = FALSE) %>%
mutate(year = numbers[i])
football_df <- bind_rows(football_df, df)
}
return(football_df)
}
years <- c(1617, 1415, 1314)
final_df <- football(years)

Related

Optimizing large db merge using split() function

I need to perform a conceptually straightforward double left-merge followed by a simple series of matching functions (See: Straightforward Solution). However, given the DBs I have to merge are large in size I tried to unpack the merging procedure by considering a for-loop that does the trick but is inefficient to say the least (See: For-loops Solution).
Is there a solution splitting and naming at least the largest db?
Below there is a toy example.
For reference, in my data:
db_m1 ~50k lines (for ~5k unique m1)
db_m2 ~25k lines (for ~5k unique m1 and m2)
db_p ~100m lines
set.seed(0)
db_m1 <- data.frame(
y=rep(1,20),
id=sort(rep(paste0("id_",c(letters[1:4])),5)),
m1=rep(c(1,2),10),
x1=sample(LETTERS, 20, TRUE),
x2=sample(LETTERS, 20, TRUE))
set.seed(0)
db_m2 <- data.frame(y=rep(1,20),
m1=sample(c(1:5),20,TRUE),
m2=sample(c(6:10),20,TRUE))
set.seed(0)
db_p <- data.frame(m2=sample(c(6:10),100,TRUE),
y1=sample(LETTERS, 100,TRUE),
y2=sample(LETTERS, 10,TRUE))
Straightforward Solution :
final_dplyr <- db_m1 %>%
dplyr::left_join(db_m2) %>%
dplyr::left_join(db_p) %>%
dplyr::mutate(match_1=ifelse(x1==y1|x1==y2,1,0),
match_2=ifelse(x2==y1|x2==y2,1,0),
sum_matches=mapply(sum,match_1,match_2),
final_1 = ifelse(as.numeric(sum_matches)>=1,1,0),
final_2 = ifelse(as.numeric(sum_matches)>=2,1,0)) %>%
group_by(id,m2) %>%
dplyr::mutate(n_p=n(),
n_p=ifelse(all(is.na(y1)),NA,n_p)) %>%
group_by(y,id,m1,m2,n_p) %>%
dplyr::summarise(match_1=sum(match_1,na.rm = T),
match_2=sum(match_2,na.rm = T),
final_1 = sum(final_1),
final_2 = sum(final_2))
For-loops Solution:
fn_final <- function(db_m1,db_m2,db_p) {
matches_final <- vector("list",length = length(unique(db_m1$y)))
for(i in 1:length(unique(db_m1$y))){
matches <- vector("list",length = length(unique(db_m1$m1)))
for(j in 1:length(unique(db_m1$m1))){
temp_db_m1 <- db_m1 %>% dplyr::filter(y==unique(db_m1$y)[i], m1==unique(db_m1$m1)[j])
temp_db_m2 <- db_m2 %>% dplyr::filter(y==unique(db_m1$y)[i], m1==unique(db_m1$m1)[j])
m_vector <- unique(temp_db_m2$m2)
temp_db_p <- db_p %>%
dplyr::filter(m2 %in% m_vector)
final <- db_m1 %>%
dplyr::left_join(db_m2) %>%
dplyr::left_join(db_p) %>% dplyr::mutate(match_1=ifelse(x1==y1|x1==y2,1,0),
match_2=ifelse(x2==y1|x2==y2,1,0),
sum_matches=mapply(sum,match_1,match_2),
final_1 = ifelse(as.numeric(sum_matches)>=1,1,0),
final_2 = ifelse(as.numeric(sum_matches)>=2,1,0)) %>%
group_by(id,m2) %>%
dplyr::mutate(n_p=n(),
n_p=ifelse(all(is.na(y1)),NA,n_p)) %>%
group_by(y,id,m1,m2,n_p) %>%
dplyr::summarise(match_1=sum(match_1,na.rm = T),
match_2=sum(match_2,na.rm = T),
final_1 = sum(final_1),
final_2 = sum(final_2))
matches[[j]] <- final
}
matches_all <- do.call(rbind, matches)
matches_final[[i]] <- matches_all
}
final <- do.call(rbind, matches_final) %>%
dplyr::filter(!is.na(n_p)) %>%
unique()
return(final)
}
final_for <- fn_final(db_m1,db_m2,db_p)
This is a possible solution, should it be optimized further?
db_m1_s <- split(db_m1, f = list(db_m1$y,db_m1$m1))
db_m2_s <- split(db_m2, f = list(db_m2$y,db_m2$m1))
db_p_s <- split(db_p, f = list(db_p$m2))
match_fn <- function(temp_db_m1,temp_db_m2,temp_db_p){
final <- temp_db_m1 %>%
dplyr::left_join(temp_db_m2) %>%
dplyr::left_join(temp_db_p) %>%
dplyr::mutate(match_1=ifelse(x1==y1|x1==y2,1,0),
match_2=ifelse(x2==y1|x2==y2,1,0),
sum_matches=mapply(sum,match_1,match_2),
final_1 = ifelse(as.numeric(sum_matches)>=1,1,0),
final_2 = ifelse(as.numeric(sum_matches)>=2,1,0)) %>%
group_by(id,m2) %>%
dplyr::mutate(n_p=n(),
n_p=ifelse(all(is.na(y1)),NA,n_p)) %>%
group_by(y,id,m1,m2,n_p) %>%
dplyr::summarise(match_1=sum(match_1,na.rm = T),
match_2=sum(match_2,na.rm = T),
final_1 = sum(final_1),
final_2 = sum(final_2))
return(final)
}
fn_final <- function(db_m1,db_m1_s,db_m2_s,db_p_s) {
m <- names(db_m1_s)
matches_1 <- vector("list",length = length(m))
for(i in 1:length(m)){
temp_db_m1 <- db_m1_s[[m[i]]]
temp_db_m2 <- db_m2_s[[m[i]]]
n <- as.character(sort(unique(temp_db_m2$m2)))
matches_2 <- vector("list",length = length(n))
for(j in 1:length(n)){
temp_db_p <- db_p_s[[n[j]]]
final <- match_fn(temp_db_m1,temp_db_m2,temp_db_p)
matches_2[[j]] <- final
}
matches_all <- do.call(rbind, matches_2)
matches_1[[i]] <- matches_all
}
matches_0 <- do.call(rbind, matches_1) %>%
dplyr::filter(!is.na(n_p)) %>%
unique()
return(matches_0)
}
final_for <- fn_final(db_m1,db_m1_s,db_m2_s,db_p_s)

Automate script by changing str_detect?

I am using this script to produce a table. In the second line, for collectionName, I am using "Organization X". I have many different organizations (Org Y, Org Z, .....) to create this table for. Is there a way to automate this? And to automate the naming of the object (currently "orgx" below)?
orgx <- df %>%
filter(str_detect(collectionName, c("Organization X"))) %>%
filter(str_detect(Year, paste(years, collapse = "|"))) %>%
corpus(text_field = "text") %>%
tokens(remove_punct = TRUE) %>%
tokens_select(stopwords('english'),selection='remove') %>%
tokens_tolower(keep_acronyms = FALSE) %>%
tokens_lookup(dictionary = dict, nomatch = TRUE) %>%
dfm() %>%
dfm_group(groups = "Title") %>%
dfm_weight(scheme = "prop") %>%
as.data.frame() %>%
mutate_at(vars(keyterms, true), funs(round(., 4)))
Get the column names specific to that organizations as vector, use that as pattern in str_detect by looping over the vector in map and return the output in a list
library(dplyr)
library(purrr)
library(stringr)
vec <- c("Organization X", "Organization Y")
out <- map(vec, ~
df %>%
filter(str_detect(collectionName, .x)) %>%
filter(str_detect(Year, paste(years, collapse = "|"))) %>%
corpus(text_field = "text") %>%
tokens(remove_punct = TRUE) %>%
tokens_select(stopwords('english'),selection='remove') %>%
tokens_tolower(keep_acronyms = FALSE) %>%
tokens_lookup(dictionary = dict, nomatch = TRUE) %>%
dfm() %>%
dfm_group(groups = "Title") %>%
dfm_weight(scheme = "prop") %>%
as.data.frame() %>%
mutate_at(vars(keyterms, true), funs(round(., 4)))
)
names(out) <- sub("^(...).*\\s+(\\S)$", "\\1\\2", vec)
It may be better to keep the output in a list. But, if we need to assign it to different objects, it can be done with list2env or assign
list2env(out, .GlobalEnv)

How to wirte a loop to repeat entire block code in r scripts?

I want to import 15 different datasets and clean them up. Raw dataset names are like C1_1, C2_1, C3_1 ... C15_1.
My code is as follows for the first dataset:
dataC1_1 <- read.delim("C1_1.txt",header = FALSE)
dataC1_1 <- dataC1_1[-1,-c(1,4,8:11)]
dataC1_1 <- na.omit(dataC1_1)
dataC1_1 <- dataC1_1[!(dataC1_1$V3=="Experiment"),]
dataC1_1 <- dataC1_1[!(dataC1_1$V5=="Key: Return"),]
dataC1_1 <- dataC1_1[order(dataC1_1$V6),]
dataC1_1$q_id <- strrep(c("q1","q2","q3","q4"),times = 1)
dataC1_1$response <- dataC1_1$V5 %>% str_match_all("[0-9]+") %>% unlist %>% as.numeric
dataC1_1 <- dataC1_1[,-c(1,3,4)]
dataC1_1 <- setnames(dataC1_1,c("ad_id","rt","q_id","response"))
dataC1_1$id <- rep("C1",length(dataC1_1$q_id))
I have tried so many times with while loop and if loop, but I just could not repeat 15 times.
Anyone could help me out?
Thanks!
Create a function to apply on each of the datasets while reading the datasets in a loop
library(readr)
librar(dplyr)
library(stringr)
map2(sprintf("C%d_1", 1:15), str_c("C", 1:5), f1)
where
f1 <- function(nm, id) {
read_csv(nm) %>%
select(-c(1, 4, 8:11)) %>%
slice(-1) %>%
na.omit %>%
filter(V3 != "Experiment"| V5 != "Key: Return") %>%
arrange(V6) %>%
mutate(q_id = strrep(c("q1","q2","q3","q4"),times = 1),
response = str_match(V5,("[0-9]+") %>% unlist %>% as.numeric) %>%
select(-c(1, 3, 4)) %>%
set_names(c("ad_id","rt","q_id","response")) %>%
mutate(id = id)
}

Trying to understand how eval(expr, envir = df) works

I have built a function which seems to work, but I don't understand why.
My initial problem was to take a data.frame which contains counts of a population and expand it to re-create the original population. This is easy enough if you know the column names in advance.
library(tidyverse)
set.seed(121)
test_counts <- tibble(Population = letters[1:4], Length = c(1,1,2,1),
Number = sample(1:100, 4))
expand_counts_v0 <- function(Length, Population, Number) {
tibble(Population = Population,
Length = rep(Length, times = Number))
}
test_counts %>% pmap_dfr(expand_counts_v0) %>% # apply it
group_by(Population, Length) %>% # test it
summarise(Number = n()) %>%
ungroup %>%
{ all.equal(., test_counts)}
# [1] TRUE
However, I wanted to generalise it to a function which didn't need to know at the column names of the data.frame, and I'm interested in NSE, so I wrote:
test_counts1 <- tibble(Population = letters[1:4],
Length = c(1,1,2,1),
Number = sample(1:100, 4),
Height = c(100, 50, 45, 90),
Width = c(700, 50, 60, 90)
)
expand_counts_v1 <- function(df, count = NULL) {
countq <- enexpr(count)
names <- df %>% select(-!!countq) %>% names
namesq <- names %>% map(as.name)
cols <- map(namesq, ~ expr(rep(!!., times = !!countq))
) %>% set_names(namesq)
make_tbl <- function(...) {
expr(tibble(!!!cols)) %>% eval(envir = df)
}
df %>% pmap_dfr(make_tbl)
}
But, when I test this function it seems to duplicate rows 4 times:
test_counts %>% expand_counts_v1(count = Number) %>%
group_by(Population, Length) %>%
summarise(Number = n()) %>%
ungroup %>%
{ sum(.$Number)/sum(test_counts$Number)}
# [1] 4
This lead me to guess a solution, which was
expand_counts_v2 <- function(df, count = NULL) {
countq <- enexpr(count)
names <- df %>% select(-!!countq) %>% names
namesq <- names %>% map(as.name)
cols <- map(namesq, ~ expr(rep(!!., times = !!countq))
) %>% set_names(namesq)
make_tbl <- function(...) {
expr(tibble(!!!cols)) %>% eval(envir = df)
}
df %>% make_tbl
}
This seems to work:
test_counts %>% expand_counts_v2(count = Number) %>%
group_by(Population, Length) %>%
summarise(Number = n()) %>%
ungroup %>%
{ all.equal(., test_counts)}
# [1] TRUE
test_counts1 %>% expand_counts_v2(count = Number) %>%
group_by(Population, Length, Height, Width) %>%
summarise(Number = n()) %>%
ungroup %>%
{ all.equal(., test_counts1)}
# [1] TRUE
But I don't understand why. How is it evaluating for each row, even though I'm not using pmap anymore? The function needs to be applied to each row in order to work, so it must be somehow, but I can't see how it's doing that.
EDIT
After Artem's correct explanation of what was going on, I realised I could do this
expand_counts_v2 <- function(df, count = NULL) {
countq <- enexpr(count)
names <- df %>% select(-!!countq) %>% names
namesq <- names %>% map(as.name)
cols <- map(namesq, ~ expr(rep(!!., times = !!countq))
) %>% set_names(namesq)
expr(tibble(!!!cols)) %>% eval_tidy(data = df)
}
Which gets rid of the unnecessary mk_tbl function. However, as Artem said, that is only really working because rep is vectorised. So, it's working, but not by re-writing the _v0 function and pmapping it, which is the process I was trying to replicate. Eventually, I discovered, rlang::new_function and wrote:
expand_counts_v3 <- function(df, count = NULL) {
countq <- enexpr(count)
names <- df %>% select(-!!countq) %>% names
namesq <- names %>% map(as.name)
cols <- map(namesq, ~ expr(rep(!!., times = !!countq))
) %>% set_names(namesq)
all_names <- df %>% names %>% map(as.name)
args <- rep(0, times = length(all_names)) %>% as.list %>% set_names(all_names)
correct_function <- new_function(args, # this makes the function as in _v0
expr(tibble(!!!cols)) )
pmap_dfr(df, correct_function) # applies it as in _v0
}
which is longer, and probably uglier, but works the way I originally wanted.
The issue is in eval( envir = df ), which exposes the entire data frame to make_tbl(). Notice that you never use ... argument inside make_tbl(). Instead, the function effectively computes the equivalent of
with( df, tibble(Population = rep(Population, times = Number),
Length = rep(Length, times=Number)) )
regardless of what arguments you provide to it. When you call the function via pmap_dfr(), it essentially computes the above four times (once for each row) and concatenates the results by-row, resulting in the duplication of entries you've observed. When you remove pmap_dfr(), the function is called once, but since rep is itself vectorized (try doing rep( test_counts$Population, test_counts$Number ) to see what I mean), make_tbl() computes the entire result in one go.

For Loop Ending Early - Why?

I have a loop that should run about 300,000 times, but it ends at 55 when I bind data to a data frame and I have no clue what is happening.
The loop in question is:
TrendlineMeta <- data.frame("FutureRecord" = character(), "System" = numeric(), "Intercepts" = numeric(), "Slopes" = numeric(), stringsAsFactors = FALSE)
for (i in unique(TrendingData$FutureRecord)){
FilteredList <- TrendingData[TrendingData$FutureRecord == i,]
Regressed <- lm(FilteredList$Value ~ FilteredList$Time)#, na.action = na.omit)
newrow <- c("FutureRecord"=j, "System"=max(as.character(FilteredList$System)), "Intercepts"=summary(Regressed)$coefficients[1,1], "Slopes"=summary(Regressed)$coefficients[2,1])
TrendlineMeta <- rbind(TrendlineMeta, data.frame(as.list(newrow), stringsAsFactors = FALSE))
}
and ends after 55 itterations.
However, this loop:
TrendlineMeta <- data.frame("FutureRecord" = character(), "System" = numeric(), "Intercepts" = numeric(), "Slopes" = numeric(), stringsAsFactors = FALSE)
for (i in unique(TrendingData$FutureRecord)){
FilteredList <- TrendingData[TrendingData$FutureRecord == i,]
Regressed <- lm(FilteredList$Value ~ FilteredList$Time)#, na.action = na.omit)
#newrow <- c("FutureRecord"=j, "System"=max(as.character(FilteredList$System)), "Intercepts"=summary(Regressed)$coefficients[1,1], "Slopes"=summary(Regressed)$coefficients[2,1])
#TrendlineMeta <- rbind(TrendlineMeta, data.frame(as.list(newrow), stringsAsFactors = FALSE))
}
completes fine.
What about this am I doing wrong? I am new to R so nothing is jumping out at me.
So, this is just a stab at your issue, but it's a little difficult without seeing the underlying dataset. I'm using Hadley's purrr, tidyr, plyr and dplyr packages.
It may accomplish what you're trying to do without using the loop.
partA <- TrendingData %>%
split(.$FutureRecord) %>%
map(~ lm(Value ~ Time, data = .)) %>%
map(summary) %>%
map("coefficients") %>%
map(data.frame) %>%
map(~ select(.x, Estimate) %>%
mutate(coef = row.names(.))) %>%
ldply(rbind) %>%
rename(FutureRecord = .id) %>%
spread(coef, Estimate)
From here,
partB <- TrendingData %>%
select(FutureRecord, System) %>%
group_by(FutureRecord) %>%
filter(System == max(System)) %>%
ungroup
Then,
left_join(partA, partB)
Does that work?

Resources