assign dynamic variable names using a loop and mutate from dplyr - r

I would like to split a character field into individual variables, one for each character in a string.
library(dplyr)
temp1 <- data.frame(a = c('dedefdewfe', 'rewewqreqw'))
for (i in 1:10) {
temp1 <- temp1 %>%
  mutate(paste('v', i, sep = '') = substr(a, i, i))
}
The resulting data frame would have 11 variables: the original a, plus v1 through v10.

tidyr::separate is good for this. You can't split on an empty string, but you can specify splitting positions ...
library(tidyr)
library(dplyr)
temp1 %>%
  mutate(b = a) %>% ## make a copy
  separate(b, into = paste0("v", 1:10), sep = 1:9)
(It's probably better practice to compute nc <- nchar(temp1$a[1]) and then use nc and nc - 1 in place of 10 and 9.)
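A minimal sketch of that generalisation, assuming every value of a has the same number of characters:
library(dplyr)
library(tidyr)
nc <- nchar(temp1$a[1])  # assumes all strings in a have equal length
temp1 %>%
  mutate(b = a) %>%  ## keep the original column
  separate(b, into = paste0("v", 1:nc), sep = 1:(nc - 1))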

Related

How to insert a part of a value of a column into a new column

I have not been programming for very long and have run into a problem I have not yet been able to solve.
In my dataframe there is a column that contains several pieces of information. For example, one row looks like this:
sp|O94910|AGRL1_HUMAN
or like this
sp|Q13554|KCC2B_HUMAN;sp|Q13555|KCC2G_HUMAN
Now I want to create a new column with the combination of digits between the two vertical bars.
For the upper example it would be O94910, for the lower Q13554; Q13555
I have already tried functions like str_extract_all, str_match and gsub, but nothing has worked.
The id column is the one I am looking at. It contains different combinations of letters and digits; I need the part between the two | characters.
> dput(head(anaDiff_PD_vs_CTRL$id, 10))
c("sp|O94910|AGRL1_HUMAN", "sp|P02763|A1AG1_HUMAN", "sp|P19652|A1AG2_HUMAN",
"sp|P25311|ZA2G_HUMAN", "sp|Q8NFZ8|CADM4_HUMAN", "sp|P08174|DAF_HUMAN",
"sp|Q15262|PTPRK_HUMAN", "sp|P78324|SHPS1_HUMAN;sp|Q5TFQ8|SIRBL_HUMAN;sp|Q9P1W8|SIRPG_HUMAN",
"sp|Q8N3J6|CADM2_HUMAN", "sp|P19021|AMD_HUMAN")>
With dplyr and stringr you can try...
library(dplyr)
library(stringr)
dat %>%
  rowwise() %>%
  mutate(dig = str_extract_all(col, "(?<=sp\\|)[A-Z0-9]+(?=\\|)"),
         dig = paste0(dig, collapse = "; "))
#> # A tibble: 4 x 2
#> # Rowwise:
#>   col                                         dig
#>   <chr>                                       <chr>
#> 1 sp|Q8NFZ8|CADM4_HUMAN                       Q8NFZ8
#> 2 sp|94910|AGRL1_HUMAN                        94910
#> 3 sp|O94910|AGRL1_HUMAN                       O94910
#> 4 sp|Q13554|KCC2B_HUMAN;sp|Q13555|KCC2G_HUMAN Q13554; Q13555
data
dat <- data.frame(col = c("sp|Q8NFZ8|CADM4_HUMAN", "sp|94910|AGRL1_HUMAN", "sp|O94910|AGRL1_HUMAN", "sp|Q13554|KCC2B_HUMAN;sp|Q13555|KCC2G_HUMAN"))
Created on 2022-02-02 by the reprex package (v2.0.1)
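str_extract_all() is list-valued, which is why the rowwise() + paste0() step is needed. As a hedged alternative, the list can be collapsed without rowwise() via sapply(), using the same regex and the same dat as above:
dat %>%
  mutate(dig = sapply(str_extract_all(col, "(?<=sp\\|)[A-Z0-9]+(?=\\|)"),
                      paste, collapse = "; "))  # collapse each vector of matches into one string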
Here is a solution without tidyverse:
dat <- read.table(text = "
sp|Q8NFZ8|CADM4_HUMAN
sp|94910|AGRL1_HUMAN
sp|O94910|AGRL1_HUMAN
sp|Q13554|KCC2B_HUMAN;sp|Q13555|KCC2G_HUMAN")
ids <- strsplit(dat$V1, ";")
ids <- lapply(ids, function(x) gsub("sp\\|([[:alnum:]]*)\\|.*", "\\1", x))
ids <- lapply(ids, function(x) paste(x, collapse="; "))
dat$newcol <- unlist(ids)
Even with tidyverse, I would define a helper function for more clarity:
extract_ids <- function(x) {
  ids <- strsplit(x, ";")
  ids <- map(ids, ~ gsub("sp\\|([[:alnum:]]*)\\|.*", "\\1", .))
  ids <- map(ids, ~ paste(., collapse = "; "))
  unlist(ids)
}
dat <- dat %>% mutate(ids = extract_ids(V1))
This solution should help if you want to change your column names in a similar fashion:
library(tidyverse)
# create test data frame with column names "sp|O94910|AGRL1_HUMAN" and "sp|Q13554|KCC2B_HUMAN;sp|Q13555|KCC2G_HUMAN"
col1 <- c(1,2,3,4,5)
col2 <- c(6,7,8,9,10)
df <- data.frame(col1, col2)
names(df)[1] <- "sp|O94910|AGRL1_HUMAN"
names(df)[2] <- "sp|Q13554|KCC2B_HUMAN;sp|Q13555|KCC2G_HUMAN"
# split the column-name strings on "|" into a data frame of their parts
names <- as.data.frame(str_split(colnames(df), "\\|", simplify = TRUE))
# blank out every string that contains fewer digits than letters or special characters
for (i in 1:nrow(names)) {
  for (j in 1:ncol(names)) {
    if (str_count(as.vector(str_split(names[i, j], "\\|", simplify = TRUE)), "[0-9]") >
        str_count(as.vector(str_split(names[i, j], "\\|", simplify = TRUE)), "[:alpha:]|[:punct:]")) {
      names[i, j] <- names[i, j]
    } else {
      names[i, j] <- ""
    }
  }
}
# combine the split parts into a single column called "colnames"
names <- names %>% unite("colnames", 1:5, na.rm = TRUE, remove = TRUE, sep = ";")
# remove all ";" separators at the start of the strings, the end of the strings, and series of ";" into a single ";"
for (i in 1:nrow(names)){
names[i,] <- str_replace(names[i,],"\\;+$", "") %>%
str_replace("^\\;+", "") %>%
str_replace("\\;{2}", ";")
}
# convert column with new names into a vector
new_names <- as.vector(names$colnames)
# replace old names with new names
names(df) <- new_names
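For comparison, a shorter sketch that reuses the lookaround regex from the answers above and applies it directly to colnames(df), under the same assumption that every ID sits between two | characters:
library(stringr)
names(df) <- sapply(str_extract_all(colnames(df), "(?<=sp\\|)[A-Z0-9]+(?=\\|)"),
                    paste, collapse = ";")  # e.g. "O94910" and "Q13554;Q13555"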

Fully programmatically rename columns in R with dplyr

I have sensor data, for several different sensor types, in many dataframes. I need to perform inner_joins on the dataframes so that I end up with one dataframe. The column names of the dataframes for a given sensor type are identical, e.g.
> z501h001
timeBgn soilTempMean soilTempVar
1 01:00:00 100 4
2 01:30:00 112 6
3 02:00:00 111 6
> z501h002
timeBgn soilTempMean soilTempVar
1 01:00:00 120 4
2 01:30:00 122 6
3 02:00:00 121 5
except there are way more columns. The column names are different for different types of sensors (they all have timeBgn in common).
I need (in R) a flexible way to rename the columns (so I can tell which column corresponds to which sensor) based on adding a suffix to the existing column names for all columns except timeBgn (which is the common column by which the inner_join will be done).
Here is the Python / Pandas equivalent of what I am trying to do:
def rename_cols_by_sensor(df, sensor_name):
    cols = df.columns
    new_cols = [f'{c}_{sensor_name}' if c != 'timeBgn' else c for c in cols]
    df.columns = new_cols
I found most of a solution here:
programmatically rename columns in dplyr
The problem is that I cannot figure out how to make the cnames vector programmatically. I do not want to hard-code all of the myriad column names. As an example for z501h001 it would need to look like
cnames <- c('soilTempMean' = 'soilTempMean_z501h001', 'soilTempVar' = 'soilTempVar_z501h001')
the suffix (in the example: _z501h001) can be passed to the function so there is no need to discuss obtaining it here. The original names are easily obtained using names(df). All I need to know is how to put them together in this c("character" = "other_character") format.
I have tried:
rename_by_loc <- function(df, loc) {
  old_names <- names(df)
  new_names <- c()
  loc <- "z501h001"
  for (name in old_names) {
    if (name != "timeBgn") {
      new_names <- c(new_names, paste(name, paste(name, loc, sep = "_"), sep = " = "))
    }
  }
  return(new_names)
}
but that gives me names like "soilTempMean = soilTempMean_z501h001"
I need the = to be outside of the character strings. I have tried a few other things. None have been successful.
This problem is trivial using Pandas which makes me think I am missing something about column renaming in R.
Thanks.
We can use mget to collect all objects whose names match the pattern (names starting with 'z' followed by 3 digits, then 'h' and 3 more digits) into a list, then use imap to loop over that list and rename every column except 'timeBgn' by concatenating (str_c) the original column name with the object name.
library(dplyr)
library(purrr)
library(stringr)
out <- mget(ls(pattern = "^z\\d{3}h\\d{3}$")) %>%
  imap(~ {
    nm1 <- .y
    .x %>%
      rename_with(~ str_c(., "_", nm1), -timeBgn)
  })
The output will be a list. If we need to change the column names in the original objects (not recommended), use list2env:
list2env(out, .GlobalEnv)
Or using base R
v1 <- ls(pattern = "^z\\d{3}h\\d{3}$")
for (v in v1) {
  tmp <- get(v)
  i1 <- names(tmp) != 'timeBgn'
  names(tmp)[i1] <- paste0(names(tmp)[i1], '_', v)
  assign(v, tmp)
}
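To answer the literal question about building the c("old" = "new") vector (the cnames object from the question), here is a minimal sketch with setNames(); make_cnames is just an illustrative name, and how you then consume cnames depends on the approach in the linked answer:
make_cnames <- function(df, loc) {
  old_names <- setdiff(names(df), "timeBgn")
  setNames(paste0(old_names, "_", loc), old_names)  # names = old, values = new
}
cnames <- make_cnames(z501h001, "z501h001")
# c(soilTempMean = "soilTempMean_z501h001", soilTempVar = "soilTempVar_z501h001")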

How do I get the column number from a dataframe which contains specific strings?

I have a data frame df with 7 columns and I have a list z containing multiple strings.
I want a dataframe containing only the columns in df whose names contain one of the strings in z.
df <- data.frame("a_means","b_means","c_means","d_means","e_mean","f_means","g_means")
z <- c("a_m","c_m","f_m")
How do I get the column number of the z strings in df? Or how do I get a dataframe with only the columns which contains the z strings.
What I want is:
print(df)
"a_means" "c_m" "f_m"
What I tried:
match(a, names(df)
and
df[,which(colnames(df) %in% colnames(df[ ,grepl(z,names(df)])]
You can use:
df[,match(z, substring(colnames(df), 1, 3))]
With base R:
z <- paste(z, collapse = "|")
df[, grepl(z, names(df))] # you could use grep as well
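If you want the column numbers themselves (as the title asks), grep() on the collapsed pattern returns the indices directly:
grep(z, names(df))         # column indices, e.g. 1 3 6 for the example df
df[, grep(z, names(df))]   # equivalent subset to the grepl() version above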
Combine the search patterns and use the result as the pattern for stringr::str_detect().
library(dplyr)
library(stringr)
df <- data.frame(a_means = "a_means",
                 b_means = "b_means",
                 c_means = "c_means",
                 d_means = "d_means",
                 e_means = "e_means",
                 f_means = "f_means",
                 g_means = "g_means")
z <- c("a_m","c_m","f_m")
z <- paste(z, collapse = "|")
df %>% select_if(str_detect(names(df), z))
#> a_means c_means f_means
#> 1 a_means c_means f_means
You can simply do this:
library(dplyr)
df %>%
select(contains(z))
Check out help("starts_with"). You can also match to a starting prefix with starts_with() among other things.
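For instance, since the entries of z are prefixes here, something along these lines should also work (recent tidyselect versions accept a character vector and take the union of the matches):
df %>% select(starts_with(z))  # assumes z is still the original vector c("a_m", "c_m", "f_m")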
You can use select and matches to subset the columns based on z:
library(dplyr)
df <- data.frame("a_means","b_means","c_means","d_means","e_mean","f_means","g_means")
z <- c("a_m","c_m","f_m")
df %>%
  select(matches(z))
#> X.a_means. X.c_means. X.f_means.
#> 1 a_means c_means f_means

Split columns or vector to pass to Purrr Function

I have a web scrape function that I created that gets data from an API. I pass a df column I have to one of the function arguments in the web scrape function. The issue I'm having is that the URL takes up to 500 numbers in one of the parameters, and my df has 2000 rows.
How would I split the rows by 500 in order to pass the values into the function?
I've created a very basic reprex that shows the workflow of what I am looking to do. I want to pass the split df column to the parse function. I'm guessing I would need to wrap the JSON parse with map_dfr
library(tidyverse)
sample_df <- tibble(id = 1:20,
                    col_2 = rnorm(1:20))
# parse function
parse_people <- function(ids = c("1", "10"), argument_2 = NULL) {
  # fake base URL
  base_url <- "https://www.thisisafakeurl.com/api/people?Ids="
  # fix query parameters: collapse the ids to pass to the URL
  ids <- stringr::str_c(ids, collapse = ",")
  url <- glue::glue("{base_url}{ids}")
  # send the GET request
  resp <- httr::GET(url)
  # save the response body as text
  out <- httr::content(resp, as = "text", encoding = "UTF-8")
  # parse the JSON into a data frame
  jsonlite::fromJSON(out, simplifyDataFrame = TRUE, flatten = TRUE)
}
sample_parse <- parse_people(sample_df$id)
I think I probably need two functions: one that parses the data, and one that uses map_dfr over the splits.
Something like:
# Split ID's from DF here. I want blocks of 500 rows to pass below
# Map Split ID's over parse_people
ids %>%
  map_dfr(parse_people)
If we need to split the data.frame into a list of data.frames, an option is group_split with gl:
library(dplyr)
library(purrr)

n <- 3
lst1 <- sample_df %>%
  group_split(grp = as.integer(gl(n(), n, n())), keep = FALSE) %>%
  map(pull, id)
and pass it to the function as
map(lst1, ~ parse_people(ids = .x))
Possible duplicate here.
In the meantime, you can split your 20 row dataframe into 5 dataframes of 4 rows each via:
sample_df <- tibble(id = 1:20,
                    col_2 = rnorm(1:20))
split(sample_df, rep(1:5, each = 4))
Then you can pass the resulting list of dataframes to a purrr function.
Edit: If you don't know the total rows in advance, want to split by a given number, but also include all rows, there's another solution in the link:
chunk <- 3
n <- nrow(sample_df)
r <- rep(1:ceiling(n / chunk), each = chunk)[1:n]
d <- split(sample_df, r)
Here I want chunks of 3, but it will include all rows (the last data frame in the list has 2 rows)
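Tying this back to the original problem (roughly 2000 ids, at most 500 per request), a minimal sketch, assuming parse_people() is defined as in the question and that each API response parses to a data frame:
library(purrr)
chunk_size <- 500
id_chunks <- split(sample_df$id, ceiling(seq_along(sample_df$id) / chunk_size))
result <- map_dfr(id_chunks, ~ parse_people(ids = .x))  # one request per chunk, rows bound together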

I give three arguments: the input df, the column I want to clean, and the new column I want to be added with the cleaned names. Where am I going wrong?

library(dplyr)
clean_name <- function(df, col_name, new_col_name) {
  # remove whitespace and common titles
  df$new_col_name <- mutate_all(df,
    trimws(gsub("MR.?|MRS.?|MS.?|MISS.?|MASTER.?", "", df$col_name)))
  # remove any chunks of text where a number is present
  df$new_col_name <- transmute_all(df,
    gsub("[^\\s]*[\\d]+[^\\s]*", "", df$col_name, perl = TRUE))
}
I get the following error
"Error: Column new_col_name must be a 1d atomic #vector or a list"
What you want to do is make sure that the output of the functions you're using is either a vector or a one-dimensional list, so that it can be added as a new column of the data frame. You can check the class of an object with the class() function from base R.
The mutate function by itself should do what you want; it returns the same data frame with the new column added:
library(dplyr)
clean_name <- function(df, col_name, new_col_name) {
  # first_cleaning_to_colname  = the first change you want to make to the col_name column; this should be a vector
  # second_cleaning_to_colname = the change made to the col_name column after the first one; this should also be a vector
  first_change <- mutate(df, col_name = first_cleaning_to_colname)
  second_change <- mutate(first_change, new_col_name = second_cleaning_to_colname)
  return(second_change)
}
You can make both changes in the same mutate() call, but I thought this way is easier to read.
If we are passing unquoted column names, then use
library(tidyverse)
clean_name <- function(df, col_name, new_col_name) {
  col_name <- enquo(col_name)
  new_col_name <- enquo(new_col_name)
  df %>%
    mutate(!!new_col_name :=
             trimws(str_replace_all(!!col_name, "MR.?|MRS.?|MS.?|MISS.?|MASTER.?", ""))) %>%
    transmute(!!new_col_name := trimws(str_replace_all(!!new_col_name,
                                                       "[^\\s]*[\\d]+[^\\s]*", "")))
}
clean_name(dat1, col1, colN)
# colN
#1 one
#2 two
data
dat1 <- data.frame(col1 = c("MR. one", "MS. two 24"), stringsAsFactors = FALSE)
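For anyone who prefers passing the column names as strings, here is a base R sketch of the same cleaning steps (reusing the regexes from the question) that side-steps tidy evaluation entirely:
clean_name <- function(df, col_name, new_col_name) {
  x <- df[[col_name]]
  # remove whitespace and common titles
  x <- trimws(gsub("MR.?|MRS.?|MS.?|MISS.?|MASTER.?", "", x))
  # remove any chunk of text containing a digit
  x <- trimws(gsub("[^\\s]*[\\d]+[^\\s]*", "", x, perl = TRUE))
  df[[new_col_name]] <- x
  df
}
clean_name(dat1, "col1", "colN")  # adds colN ("one", "two") alongside col1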
