How do I parameterize the column names in dplyr function calls - r

Suppose we have the following setup
library(dplyr)
set.seed(10101)
id <- sample(3,20,replace = TRUE)
x <- sample(2,20,replace = TRUE)
df <- data.frame(id,x)
How do I parameterize the following:
df %>% group_by(id) %>% arrange(id) %>% mutate(x.lag=lag(x,1,default=0))
cl <- "x"
cl.lag <- "x.lag.1"
my naive attempt does not seem to work:
df %>% group_by(id) %>% arrange(id) %>% mutate(cl.lag=lag(cl,1,default=0))

Related

Merging datasets in R results in incomplete string values

I wanted to merge two datasets and the merging resulted in getting incomplete strings in a variable
# Librerias
```{r}
library(dplyr)
library(tidyr)
library(tidyverse)
library(readxl)
library(readr)
library(base)
library(stringr)
library(foreign)
library(forcats)
library(fs)
library(hablar)
library(openxlsx)
```
# Comercio CIIU
```{r}
# Mercancias
Catalogo_comercio_CIIU <- read_excel("Comercio/Catalogo comercio bienes CIIU.xlsx") %>%
mutate(ACTIV4=CIIU)
CIIU_Exportaciones <- read_excel("Comercio/CIIU Exportaciones.xlsx") %>%
pivot_longer(cols = -...1, names_to = "Period", values_to = "Valor") %>%
mutate(Flujo="Exportaciones")
CIIU_Importaciones <- read_excel("Comercio/CIIU Importaciones.xlsx") %>%
pivot_longer(cols = -...1, names_to = "Period", values_to = "Valor") %>%
mutate(Flujo="Importaciones")
Length<- CIIU_Exportaciones %>% group_by(...1) %>%
summarise(Obs=n())
Length<- Length$Obs[[1]] %>%
as.numeric() %>%
as.vector()
# Mensual
Comercio_bienes_mensual <- rbind(CIIU_Importaciones, CIIU_Exportaciones) %>%
rename(Actividad="...1") %>% group_by(Flujo, Actividad) %>%
mutate(Fecha=seq(from=as.Date("1994-01-01"), by="month", length.out=Length)) %>%
mutate(Year=str_sub(Fecha, 1L,4L), Mes=str_sub(Fecha, 6L,7L)) %>%
group_by(Flujo, Actividad, Year) %>%
mutate(Acumulado=cumsum(Valor)) %>%
group_by(Flujo, Actividad) %>%
mutate( C_acumulado= Acumulado-lag(Acumulado, n=12L), TC_acumulado=Acumulado/lag(Acumulado, n=12L)-1 ) %>%
merge(Catalogo_comercio_CIIU, by="Actividad") %>%
select(-Grupo,-Detalle,-Categoria1,-Categoria2)
```
This is the catalogue I want to merge
This is how the merging turned out (incomplete string)
I am a beginner so I don't really know what to try

Create R function from multiple scripts?

Not too good with functions. Is there a way to write the below script as a function? I have a list of dataframes that I want to apply the below scripts to.
head(iris)
iris1 <- iris %>%
group_by(Species) %>%
mutate_at(vars(Petal.Length), ~replace_na(., 0)) %>%
summarise(Petal.Length = sum(Petal.Length))
iris2 <- iris %>%
group_by(Species) %>%
tally()
iris3 <- iris2 %>%
inner_join(iris1)
iris3$average <- iris3$Petal.Length/iris3$n
Yes, its quite easy.
Let me know if this helps you:
my_function_name <- function(df){
table1 <- df %>%
group_by(org) %>%
tally()
table2 <- df %>%
group_by(org) %>%
mutate_at(vars(hours), ~replace_na(., 0)) %>%
summarise(hours = sum(hours))
table3 <- table1 %>%
inner_join(table2)
table3$average <- table3$hours/table3$n
return(list(table1,table2,table3))
}
# Calling the function
results <- my_function_name(df)
results$table1
results$table2
results$table3
In this case I used the function to retrieve all the tables. If you only want the final number table3$hours/table3$n what we can do is change the return of the function:
my_function_name <- function(df){
table1 <- df %>%
group_by(org) %>%
tally()
table2 <- df %>%
group_by(org) %>%
mutate_at(vars(hours), ~replace_na(., 0)) %>%
summarise(hours = sum(hours))
table3 <- table1 %>%
inner_join(table2)
table3$average <- table3$hours/table3$n
return(table3$average)
}
# Calling the function
results <- my_function_name(df)
results

add summary `n` from one dataframe to another dataframe (tidyverse)

I was wondering if there might be a way to replace the column fpc in DATA2 with corresponding fpc obtained from DATA1?
library(tidyverse)
dat <- read.csv('https://raw.githubusercontent.com/rnorouzian/d/master/su.csv')
## 10000 rows ################
DATA1 <- dat %>%
group_by(across(all_of(c("gender", "pre")))) %>%
summarise(n = n(), .groups = 'drop') %>%
mutate(fpc = n/sum(n)) %>%
right_join(dat)
dat2 <- read.csv('https://raw.githubusercontent.com/rnorouzian/d/master/out.csv')
## 200 rows #################
DATA2 <- dat2 %>%
group_by(across(all_of(c("gender", "pre")))) %>%
summarise(n = n(), .groups = 'drop') %>%
mutate(fpc = n/sum(n)) %>%
right_join(dat2)
You can join the dataframe and use coalesce to select fpc from DATA2.
library(dplyr)
result <- DATA2 %>%
left_join(DATA1 %>% distinct(gender, pre, fpc),
by = c('gender', 'pre')) %>%
mutate(fpc = coalesce(fpc.y, fpc.x)) %>%
select(names(DATA2))
nrow(result)
#[1] 200
It would be more efficient to do this in data.table
library(data.table)
setDT(DATA2)[as.data.table(unique(DATA1[c('gender', 'pre', 'fpc')])),
fpc := i.fpc, on = .(gender, pre)]

How to wirte a loop to repeat entire block code in r scripts?

I want to import 15 different datasets and clean them up. Raw dataset names are like C1_1, C2_1, C3_1 ... C15_1.
My code is as follows for the first dataset:
dataC1_1 <- read.delim("C1_1.txt",header = FALSE)
dataC1_1 <- dataC1_1[-1,-c(1,4,8:11)]
dataC1_1 <- na.omit(dataC1_1)
dataC1_1 <- dataC1_1[!(dataC1_1$V3=="Experiment"),]
dataC1_1 <- dataC1_1[!(dataC1_1$V5=="Key: Return"),]
dataC1_1 <- dataC1_1[order(dataC1_1$V6),]
dataC1_1$q_id <- strrep(c("q1","q2","q3","q4"),times = 1)
dataC1_1$response <- dataC1_1$V5 %>% str_match_all("[0-9]+") %>% unlist %>% as.numeric
dataC1_1 <- dataC1_1[,-c(1,3,4)]
dataC1_1 <- setnames(dataC1_1,c("ad_id","rt","q_id","response"))
dataC1_1$id <- rep("C1",length(dataC1_1$q_id))
I have tried so many times with while loop and if loop, but I just could not repeat 15 times.
Anyone could help me out?
Thanks!
Create a function to apply on each of the datasets while reading the datasets in a loop
library(readr)
librar(dplyr)
library(stringr)
map2(sprintf("C%d_1", 1:15), str_c("C", 1:5), f1)
where
f1 <- function(nm, id) {
read_csv(nm) %>%
select(-c(1, 4, 8:11)) %>%
slice(-1) %>%
na.omit %>%
filter(V3 != "Experiment"| V5 != "Key: Return") %>%
arrange(V6) %>%
mutate(q_id = strrep(c("q1","q2","q3","q4"),times = 1),
response = str_match(V5,("[0-9]+") %>% unlist %>% as.numeric) %>%
select(-c(1, 3, 4)) %>%
set_names(c("ad_id","rt","q_id","response")) %>%
mutate(id = id)
}

Combine list of data frames with one column of characters

I am learning to get, cleaning and combining data. I am confused why in a loop rbind command result in returning 10 data instead of expected 30 data as when I combine it manually (i by i).
library(XML)
mergeal <- NULL
tabnums <- 3
for (i in 1:length(tabnums)) {
bnn <- paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
tabnums[i], "&&coming=22-Oct-2015&coming=22-Oct-2015")
tem <- readHTMLTable(bnn, header=T, stringsAsFactors=F)
#data cleaning
ff <- tem[8] #wanted data
ff1 <- as.data.frame(ff)
ff2 <- ff1[ , 1] #get 1st col data only
ff3 <- unique(ff2)
ff4 <- ff3[c(2,5:13)] #wanted list only
#merging dataset
mergeal <- rbind(mergeal, ff4)
}
I've tried using list rbind list of data frames with one column of characters and numerics but still have the same result as above. Appreciate any help on what I missed, thanks.
I cleaned up the data cause I was bored.
library(plyr)
library(XML)
library(dplyr)
library(magrittr)
library(stringi)
library(tidyr)
library(lubridate)
answer =
data_frame(tabnums = 1:3) %>%
group_by(tabnums) %>%
do(.$tabnums %>%
paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
., "&&coming=22-Oct-2015&coming=22-Oct-2015") %>%
readHTMLTable(header = T, stringsAsFactors = F) %>%
extract2(8)) %>%
ungroup %>%
select(V1) %>%
distinct %>%
mutate(V1 =
V1 %>%
stri_replace_all_fixed("Â", "\n") %>%
stri_replace_all_fixed("Type:", "\nType:") %>%
stri_replace_all_fixed("Time:", "\nTime:") %>%
stri_replace_all_fixed("Area:", "\nArea:") %>%
stri_split_fixed("\n")) %>%
unnest(V1) %>%
mutate(V1 = V1 %>% stri_trim) %>%
filter(V1 %>% stri_detect_regex("^There are currently") %>% `!`) %>%
filter(V1 != "") %>%
separate(V1, c("variable", "value"), sep = ":", fill = "left") %>%
mutate(variable = variable %>% mapvalues(NA, "Description"),
ID = variable %>% `==`("Description") %>% cumsum) %>%
spread(variable, value) %>%
mutate(Area = Area %>% extract_numeric,
Price = Price %>% extract_numeric,
Datetime =
Time %>%
stri_replace_all_fixed("a.m.", "am") %>%
stri_replace_all_fixed("p.m.", "pm") %>%
paste(Date, .) %>%
dmy_hm) %>%
select(-Date, -Time)

Resources