I want to separate one column into 4 columns by "-", keep the 4 new columns, drop the original column, and rename the 4 new columns. Then I want to apply the same action to several columns.
I did it the naive way... I tried to write a function with separate and paste, but failed...
R code as below:
tabled <- table_d %>%
separate("A3 CAB PA",into=c("A3 CAB PA_count","A3 CAB PA_sumkm","A3 CAB PA_drive","A3 CAB PA_drive2"),sep="-") %>%
separate("A4 Allroad B9",into=c("A4 Allroad B9_count","A4 Allroad B9_sumkm","A4 Allroad B9_drive","A4 Allroad B9_drive2"),sep="-") %>%
separate("A5 Cabriolet B9",into=c("A5 Cabriolet B9_count","A5 Cabriolet B9_sumkm","A5 Cabriolet B9_drive","A5 Cabriolet B9_drive2"),sep="-") %>%
and so on...
Is it possible to define a function(x) and use lapply(data[,-1],function(x)) to replace the long code above?
I made a data sample with your structure
library(tidyr)
A <- c("7-4-5-9", NA)
B <- c("6-4-5-1", "7-8-6-3")
dat <- data.frame(A, B, stringsAsFactors = FALSE) # example data

# Suffixes of the four columns every original column is split into.
suffixes <- c("count", "sumkm", "drive", "drive2")

# Loops over every column of dat; to process only some of them, loop over a
# subset instead, e.g. intersect(wanted_cols, names(dat)).
for (col in names(dat)) {
  # "<col>_count", "<col>_sumkm", "<col>_drive", "<col>_drive2"
  new_names <- paste(col, suffixes, sep = "_")
  # The column name is held as a string, so it is turned into a symbol and
  # unquoted with !!. remove = TRUE drops the original column.
  dat <- separate(
    dat,
    !!rlang::sym(col),
    into = new_names,
    sep = "-",
    remove = TRUE
  )
}
Related
I have a data frame with a concatenated column whose "-"-separated parts I want to order numerically by the number after the "_".
df <- data.frame(Order = c("A23_2-A27_3-A40_4-A10_1", "A25_2-A21_3-A11_1", "A9_1", "A33_2-A8_1"))
and want to have a result like this:
df <- data.frame(Order = c("A10A23A27A40", "A11A25A21", "A9", "A8A33"))
tried couple of things with tidyverse but couldn't get a clean result.
df %>%
  # remember which original row every piece belongs to
  rowid_to_column() %>%
  # one row per "-"-separated piece
  separate_rows(Order, sep = "-") %>%
  # split each piece into its label (Order) and its number (v);
  # convert = TRUE lets v be sorted as a number
  separate(Order, into = c("Order", "v"), convert = TRUE) %>%
  arrange(v) %>%
  # glue the labels of each original row back together, now in sorted order
  group_by(rowid) %>%
  summarise(Order = str_c(Order, collapse = ""))
# A tibble: 4 x 2
rowid Order
<int> <chr>
1 1 A10A23A27A40
2 2 A11A25A21
3 3 A9
4 4 A8A33
Another base R approach:
# Re-order the "-"-separated "<label>_<number>" pieces of every Order string
# by their number and concatenate the labels.
# vapply() (instead of sapply()) guarantees a character vector with exactly
# one string per row, whatever the input looks like.
df$Order <- vapply(strsplit(df$Order, "-", fixed = TRUE), function(pieces) {
  # 2-column matrix: column 1 = label, column 2 = number
  parts <- do.call(rbind, strsplit(pieces, "_", fixed = TRUE))
  num_order <- order(as.numeric(parts[, 2]))  # order of the numeric parts
  paste(parts[num_order, 1], collapse = "")   # labels in that order
}, character(1))
Here is a base R option:
# For every Order string: split on "-", sort the pieces by the number that
# follows the underscore, strip the "_<number>" suffix and concatenate.
# vapply() pins the result to one string per row (sapply() does not).
df$Order <- vapply(
  strsplit(df$Order, "-"),
  function(pieces) {
    ranks <- as.numeric(sub("^[^_]*_", "", pieces))
    paste0(gsub("\\_.*", "", pieces[order(ranks)]), collapse = "")
  },
  character(1)
)
Output
Order
1 A10A23A27A40
2 A11A25A21
3 A9
4 A8A33
Or a tidyverse option:
library(tidyverse)
# map_chr() (instead of map()) makes Order a plain character column rather
# than a list column holding one string per row.
df %>%
  mutate(Order = map_chr(str_split(Order, "-"), function(pieces) {
    # number after the underscore decides the position of each piece
    ranks <- as.numeric(str_replace_all(pieces, "^[^_]*_", ""))
    # drop the "_<number>" suffix and concatenate in sorted order
    str_c(str_replace_all(pieces[order(ranks)], "\\_.*", ""), collapse = "")
  }))
I'm looking for a more efficient way to write the following:
Read in all my Excel files
DF1 <- read_excel(DF1, sheet = "ABC", range = cell_cols(1:10) )
DF2 <- read_excel(DF2, sheet = "ABC", range = cell_cols(1:10) )
etc...
DF50 <- read_excel(DF50, sheet = "ABC", range = cell_cols(1:10) )
Add a column to each DF with a location
DF1$Location <- location1
DF2$Location <- location2
etc...
DF50$Location <- location50
Keep only columns with specified names, get rid of blank rows, and convert column CR_NUMBER to an integer
library(hablar)
DF1 <- DF1 %>% select(all_of(colnames_r)) %>% filter(!is.na(NAME)) %>% convert(int(CR_NUMBER))
DF2 <- DF2 %>% select(all_of(colnames_r)) %>% filter(!is.na(NAME)) %>% convert(int(CR_NUMBER))
etc...
DF50 <- DF50 %>% select(all_of(colnames_r)) %>% filter(!is.na(NAME)) %>% convert(int(CR_NUMBER))
You can try to use the following getting the data in a list :
library(readxl)
library(hablar)
library(dplyr)
# Get the complete path of every file whose name contains "DF" followed by a
# number.
file_names <- list.files('/folder/path', pattern = 'DF\\d+', full.names = TRUE)

# list.files() sorts alphabetically ("DF10" comes before "DF2"). Re-order by
# the numeric part so that position i in the list lines up with DF<i> when
# the files are numbered consecutively from 1.
base_names <- basename(file_names)
file_ids <- as.integer(
  sub("DF", "", regmatches(base_names, regexpr("DF[0-9]+", base_names)))
)
file_names <- file_names[order(file_ids)]

# Read each workbook, tag it with its location, keep the wanted columns,
# drop rows without a NAME and convert CR_NUMBER to integer.
list_data <- lapply(seq_along(file_names), function(x) {
  read_excel(file_names[x], sheet = "ABC", range = cell_cols(1:10)) %>%
    mutate(Location = paste0("location", x)) %>%  # this pipe was missing
    select(all_of(colnames_r)) %>%
    filter(!is.na(NAME)) %>%
    convert(int(CR_NUMBER))
})
list_data is a list of dataframes which is usually better to manage instead of having 50 dataframes in global environment. If you still want all the dataframes separately name the list and use list2env.
# Label the elements "DF1", "DF2", ... and copy each one into the global
# environment under that name.
list_data <- setNames(list_data, paste0("DF", seq_along(list_data)))
list2env(list_data, envir = .GlobalEnv)
I need to prepend each column's name (e.g. Department) to the values in that column. I have code like this:
# Example input: two label columns and one numeric column.
Department <- paste0("Store", 1:5)
Department2 <- paste0("IT", 1:5)
x <- c(100, 200, 300, 400, 500)
Result <- data.frame(Department, Department2, x)
Result
The expected result is like:
# Expected output: every label is prefixed with the name of its column.
# (The second entry previously read "Departmentz_Store2", a typo.)
Department <- c(
  "Department_Store1", "Department_Store2", "Department_Store3",
  "Department_Store4", "Department_Store5"
)
Department2 <- c(
  "Department2_IT1", "Department2_IT2", "Department2_IT3",
  "Department2_IT4", "Department2_IT5"
)
x <- c(100, 200, 300, 400, 500)
Expected.Result <- data.frame(Department, Department2, x)
Expected.Result
Can somebody help? Thanks
Another way with dplyr and tidyr:
library(dplyr)
library(tidyr)
# Prefix every value with the name of its column.
# Only factor columns are converted to character (so reshaping never has to
# combine factors with different levels); every other column, including x,
# keeps its type, and Result itself is not modified.
Result %>%
  mutate(across(where(is.factor), as.character)) %>%
  # long format: one row per (x, column name, value)
  pivot_longer(-x, names_to = "key", values_to = "value") %>%
  mutate(var = paste(key, value, sep = "_")) %>%
  select(-value) %>%
  # back to wide format with the prefixed values
  pivot_wider(names_from = key, values_from = var) %>%
  as.data.frame()
x Department Department2
1 100 Department_Store1 Department2_IT1
2 200 Department_Store2 Department2_IT2
3 300 Department_Store3 Department2_IT3
4 400 Department_Store4 Department2_IT4
5 500 Department_Store5 Department2_IT5
Data:
# Example data: Store1..Store5, IT1..IT5 and x = 100, 200, ..., 500.
Result <- data.frame(
  Department = paste0("Store", 1:5),
  Department2 = paste0("IT", 1:5),
  x = 100 * 1:5
)
If you gather the column names in question into a vector dep_col, this is a clean base R solution with a for loop:
df <- data.frame(
  x = 1:5,
  Department = paste0("Store", 1:5),
  Department2 = paste0("IT", 1:5)
)

# Every column except the first one (x) gets its own name prepended.
dep_col <- names(df)[-1]
for (col_name in dep_col) {
  df[[col_name]] <- paste(col_name, df[[col_name]], sep = "_")
}
If I understand correctly, the OP wants to prepend the values in all columns starting with "Department" by the respective column name.
Edit By request of the OP, the code to select columns has been generalized to pick additional column names.
Here is a solution using data.table's fast set() function:
library(data.table)
setDT(Result)
# Columns whose name starts with Department, Division or Team.
cols <- grep("^(Department|Division|Team)", names(Result), value = TRUE)
for (col_name in cols) {
  # i = NULL addresses all rows of the column
  set(
    Result,
    i = NULL,
    j = col_name,
    value = paste(col_name, Result[[col_name]], sep = "_")
  )
}
Result
Department Department2 x
1: Department_Store1 Department2_IT1 100
2: Department_Store2 Department2_IT2 200
3: Department_Store3 Department2_IT3 300
4: Department_Store4 Department2_IT4 400
5: Department_Store5 Department2_IT5 500
Note that set() updates by reference, i.e., without copying the whole object.
I'm trying to write a function that takes as one of its arguments a vector of column names from user. The column names will be used to specify what columns of the dataframe will be pasted together to form a new column within dplyr::mutate. I tried to collapse the elements of argument vector first and then use the collapsed string in mutate - this is wrong. See that latest attempt below. I made other attempts but I'm not understanding the new quo, enquo, UQ, !!!, !!, and so on within dplyr. Can someone show what I need to do?
df <- data.frame(.yr = c("2000", "2001", "2002"), .mo = c("12", "01", "02"), .other = rnorm(3))
cols <- colnames(df)[1:2]
do_want <- df %>%
mutate(new = paste(.yr, .mo, sep = "-"))
# First (non-working) attempt: the column names are collapsed into a single
# plain string, and that string, not the columns it names, is handed to
# paste() inside mutate().
my_func <- function(dat, vars) {
  joined_names <- paste(vars, collapse = ",")
  dat %>%
    mutate(new = paste(joined_names, sep = "-"))
}
my_func(dat = df, vars = cols)
edit: this is my attempt at using quo and !! in the function definition. the result is a column of repeated string ".yr,.mo"
# Second (non-working) attempt with quo() and !!: the quosure still wraps the
# paste(..., collapse = ",") call, so the new column is the repeated string
# of joined names rather than the pasted column values.
my_func <- function(dat, vars) {
  joined_quo <- quo(paste(vars, collapse = ","))
  mutate(dat, new = paste(!!joined_quo, sep = "-"))
}
Because you have a list of strings, you can use rlang::syms in your function to take the strings and turn them into symbols. Then you can use !!! to splice the arguments together to put into paste.
# Paste the columns named in `vars` together, separated by "-", into a new
# column called `new`.
#   dat:  a data frame
#   vars: character vector of column names in dat
my_func <- function(dat, vars) {
  # strings -> symbols, then spliced into paste() as separate arguments
  col_syms <- rlang::syms(vars)
  mutate(dat, new = paste(!!!col_syms, sep = "-"))
}
my_func(dat = df, vars = cols)
.yr .mo .other new
1 2000 12 -0.2663456 2000-12
2 2001 01 0.5463433 2001-01
3 2002 02 -1.3133078 2002-02
Use unite.
names <- colnames(iris)
# paste(names) only returns the vector of column names, not one value per row
iris %>% mutate(new = paste(names)) # Error
# all_of() marks `names` as an external character vector of column names;
# remove = FALSE keeps the original columns next to the new one
iris %>% unite("new", all_of(names), remove = FALSE) # OK
Use mutate_ instead of mutate & turning the expression into a string worked for me:
# Build the call `paste(<col1>,<col2>,..., sep="-")` as text and hand that
# text to mutate_(), which accepts an expression given as a string.
#   dat:  a data frame
#   vars: character vector of column names in dat
# NOTE(review): mutate_() is deprecated in current dplyr releases.
dplyr_solution <- function(dat, vars) {
  arg_list <- paste(vars, collapse = ",")
  paste_call <- paste0('paste(', arg_list, ', sep="-")')
  dat %>%
    mutate_(new = paste_call)
}
dplyr_solution(dat = df, vars = cols)
Need help! This is a work-related project. I need to clean 16,000 emails... and I am expected to do it by hand :( I need to find a way to pull the domain name from the email and place it into a new column, and parse the name into a new column as well, while still keeping the original email. The data is partially complete.
library(tidyr)
library(magrittr)
Email.Address <- c('john.doe#abccorp.com','jdoe#cisco.com','johnd#widgetco.com')
First.Name <- c('John', 'JDoe','NA' )
Last.Name <- c('Doe','NA','NA')
Company <- c('NA','NA','NA')
data <- data.frame(Email.Address, First.Name, Last.Name, Company)
separate_DF <- data %>% separate(Email.Address, c("Company"), sep="#")
try this
df <- data.frame(Email.Address, First.Name, Last.Name, Company, stringsAsFactors = FALSE)

# Split every address once into the part before and after the "#" separator.
# vapply() (instead of sapply()) guarantees one string per address.
addr_parts <- strsplit(df$Email.Address, "#")
local_part <- vapply(addr_parts, `[[`, character(1), 1)
domain <- vapply(addr_parts, `[[`, character(1), 2)
local_tokens <- strsplit(local_part, "[.]")

Corp <- vapply(strsplit(domain, "[.]"), `[[`, character(1), 1)  # first domain label
F.Name <- vapply(local_tokens, `[[`, character(1), 1)           # first token
L.Name <- vapply(local_tokens, tail, character(1), n = 1)       # last token
# a local part with a single token has no separate last name
L.Name[L.Name == F.Name] <- NA

OUT <- data.frame(df$Email.Address, F.Name, L.Name, Corp)
# Fill only the cells that are missing (the string "NA" or a real NA) with
# the value parsed from the address; OUT has the same column layout as df.
missing_cells <- df == "NA" | is.na(df)
df[missing_cells] <- OUT[missing_cells]
df
the function separate from tidyr looks good too.
http://blog.rstudio.org/2014/07/22/introducing-tidyr/
From the information you have given, this also works:
library(tidyr)
df <- data.frame(Email.Address, First.Name, Last.Name, Company)
df2 <- df %>%
  # address -> part before "#" (Name) and part after it (Corp)
  separate(Email.Address, into = c("Name", "Corp"), sep = "#") %>%
  # Name -> first two "."-separated tokens; further tokens are dropped
  separate(Name, into = c("F.Name", "L.Name"), sep = "[.]", extra = "drop") %>%
  # Corp -> first two "."-separated tokens; further tokens are dropped
  separate(Corp, into = c("Corp", "com"), sep = "[.]", extra = "drop")