I have a very large dataframe with a column containing postal codes:
# Example data: one random measurement per row plus a postal code.
# Codes are stored as character strings (not factors) so they can be
# matched against textual patterns.
data <- data.frame(
  data = rnorm(n = 4),
  code = c("1001", "1130", "2001", "9010"),
  stringsAsFactors = FALSE # spelled out: `F` is an ordinary, reassignable variable
)
I also have a second, large-ish dataframe that maps postal code patterns to zones:
# Lookup table: each postal-code pattern (prefix followed by "*") and the
# zone it belongs to. Both columns are kept as character strings.
mapping <- data.frame(
  code = c("10*", "20*"),
  zone = c("zone1", "zone2"),
  stringsAsFactors = FALSE # spelled out: `F` is an ordinary, reassignable variable
)
I would like to join those two tables to add the zone column to the data dataframe but the volume of the data is too large to do a "rowwise" grepl. What is the most efficient way of doing this?
The most efficient way to deal with large objects is data.table. To do joins, you need a common column in both objects. I'm using substr to get only the first two digits of the code column in the data object. Also note that I removed the "*" from mapping as that character is not present in data.
library(data.table)

# Turn both data frames into data.tables by reference.
setDT(mapping)
setDT(data)

# Overwrite `code` with its first two characters so that it lines up with the
# (star-less) prefixes stored in `mapping`.
data[, code := substr(code, 1, 2)]

# Look every row of `data` up in `mapping`; prefixes without a zone get NA.
mapping[data, on = .(code)]
code zone data
1: 10 zone1 -1.0481912
2: 11 <NA> 1.1339476
3: 20 zone2 -0.8072921
4: 90 <NA> 1.5883562
DATA
# Data used by the data.table answer.
# Same example data as in the question ...
data <- data.frame(
  data = rnorm(n = 4),
  code = c("1001", "1130", "2001", "9010"),
  stringsAsFactors = FALSE
)
# ... and the lookup table with the trailing "*" removed from the patterns, so
# that the two-character prefixes can be joined on directly.
mapping <- data.frame(
  code = c("10", "20"),
  zone = c("zone1", "zone2"),
  stringsAsFactors = FALSE
)
I am not sure what specific method you are using when you say "rowwise" but here is what I would do in the dplyr world.
# Attach dplyr: the `dplyr::` prefix alone does not make `%>%` available.
library(dplyr)

# Rename the key column so both tables share the join column `codeString`.
mapping <- dplyr::rename(mapping, codeString = code)

# Rebuild each row's pattern (first two characters of the code followed by "*")
# and join on it. This expects `mapping` to still hold the starred patterns
# ("10*", "20*") from the question.
data <- data %>%
  dplyr::mutate(codeString = paste0(substr(code, 1, 2), "*")) %>%
  dplyr::left_join(mapping, by = "codeString")
You should be able to join like this and avoid any rowwise operation, since the pattern you're looking for is easy to create.
I have a column in a dataframe df with values of the form 'name>year>format'. I want to split this column on > and put the pieces into new columns named name, year and format. How can I do this in R?
You can do that easily using the separate function from tidyr:
library(tidyr)
library(dplyr)

# One-row example whose single column A packs three ">"-delimited fields.
data <- data.frame(A = c("Joe>1993>student"))

# Split A into name / year / format; remove = FALSE keeps the original column.
separate(
  data,
  col = A,
  into = c("name", "year", "format"),
  sep = ">",
  remove = FALSE
)
# A name year format
# Joe>1993>student Joe 1993 student
If you do not want the original column in the result dataframe change remove to TRUE
An option is read.table in base R
# Parse the ">"-delimited column with base R and append the pieces to `df`.
# quote = "" and comment.char = "" switch off read.table()'s default handling
# of quotes and "#", which would otherwise silently drop or merge parts of
# values containing those characters. stringsAsFactors = FALSE keeps the text
# fields as character on every R version.
cbind(
  df,
  read.table(
    text = as.character(df$column),
    sep = ">",
    header = FALSE,
    col.names = c("name", "year", "format"),
    quote = "",
    comment.char = "",
    stringsAsFactors = FALSE
  )
)
In case your data is big, it would be a good idea to use data.table as it is very fast.
If you know how many fields your "combined" column has:
Suppose the column has 3 fields, and you know it:
library(data.table)

# Names of the new columns: V1..V3.
# The 1:3 should be replaced by 1:n, where n is the number of fields.
field_cols <- paste0("V", 1:3)

# Split y on the literal ">" and add one new column per field, by reference.
dt1[, (field_cols) := tstrsplit(y, split = ">", fixed = TRUE)]
If you DON'T know in advance how many fields the column has:
Now we can get some help from the stringi package:
library(data.table)
library(stringi)

# The widest row has (number of ">" separators + 1) fields.
maxFields <- max(stri_count_fixed(dt2$y, ">")) + 1

# One new column per field; fill = NA pads the rows that have fewer fields.
dt2[
  ,
  paste0("V", 1:maxFields) := tstrsplit(y, split = ">", fixed = TRUE, fill = NA)
]
Data used:
library(data.table)

# dt1: every row of y has exactly three ">"-separated fields.
dt1 <- data.table(
  x = c("A", "B"),
  y = c("letter>2018>pdf", "code>2020>Rmd")
)

# dt2: dt1 plus one row that has four fields.
dt2 <- rbind(
  dt1,
  data.table(x = "C", y = "report>2019>html>pdf")
)
Below I have tried to reproduce my data in a reproducible form:
# Example data: three blocks (C1, C2, C3) of four measurements each
# (TEMP, VIB, DE, NDE), ten observations per column.
# The stray markdown backticks that wrapped this statement were removed;
# with them the snippet is not runnable R.
v <- data.frame(
  C1TEMP = c(3, 6, 1, 8, 9, 2, 2, 9, 1, 23),
  C1VIB = c(5, 6, 1, 8, 9, 2, 2, 9, 1, 23),
  C1DE = c(9, 6, 1, 8, 9, 2, 2, 9, 1, 23),
  C1NDE = c(8, 6, 1, 8, 9, 2, 2, 9, 1, 23),
  C2TEMP = c(5, 6, 1, 8, 9, 2, 2, 9, 1, 23),
  C2VIB = c(378, 6, 1, 8, 9, 2, 2, 9, 1, 23),
  C2DE = c(3, 78, 1, 8, 9, 2, 2, 9, 1, 23),
  C2NDE = c(3, 6, 1, 8, 9, 2, 2, 9, 1, 23),
  C3TEMP = c(3, 6, 89, 8, 9, 2, 2, 9, 1, 23),
  C3VIB = c(3, 6, 1, 98, 9, 2, 2, 9, 1, 23),
  C3DE = c(33, 56, 91, 82, 99, 12, 22, 19, 81, 23),
  C3NDE = c(13, 76, 91, 88, 59, 42, 22, 39, 21, 23)
)
Here I want to rbind every block of 4 columns, one on top of the other, along with a tag giving the block number. The number of columns will always be divisible by 4. I am also attaching an image to give a clear picture of the expected result.
EXPECTED OUTPUT:
I agree with YCR's comment. Still, this is a way to tackle your problem. Use the following code:
# data frames need column headers, so convert to matrix
v01 <- as.matrix(v[, 1:4])
v02 <- as.matrix(v[, 5:8])
v03 <- as.matrix(v[, 9:12])
# remove columnnames
colnames(v01) <- NULL
colnames(v02) <- NULL
colnames(v03) <- NULL
# now you can use rbind and give the columnnames back
v2 <- rbind( v01, v02, v03)
colnames(v2) <- c("C1TEMP", "C1VIB", "C1DE", "C1NDE")
v2
try this
It is a bit more convoluted than previous answers but it should be more adaptable to other data frames
# how many blocks have you got?
howMany <-table(gsub(names(v),pattern = "[0-9]",replacement = ""))[1]
# make a common name string
NAMES <- unique(gsub(names(v),pattern = "[0-9]",replacement = ""))
# create a list
list() -> V
for(i in 1:howMany){
# get the column with matching index number
v[,grep(names(v),pattern = i)] -> vi
names(vi) <- NAMES# change name
data.frame(Tag=i,vi) -> V[[i]]# put it in the list
}
# combine tables in the list into one list
do.call(rbind,V)
Nils
The melt and reshape way:
It implies to get an identifier per row:
# Same example data as in the question, plus an `id` column that identifies
# each observation (row) so it can be tracked through melt/dcast.
v <- data.frame(
  C1TEMP = c(3, 6, 1, 8, 9, 2, 2, 9, 1, 23),
  C1VIB = c(5, 6, 1, 8, 9, 2, 2, 9, 1, 23),
  C1DE = c(9, 6, 1, 8, 9, 2, 2, 9, 1, 23),
  C1NDE = c(8, 6, 1, 8, 9, 2, 2, 9, 1, 23),
  C2TEMP = c(5, 6, 1, 8, 9, 2, 2, 9, 1, 23),
  C2VIB = c(378, 6, 1, 8, 9, 2, 2, 9, 1, 23),
  C2DE = c(3, 78, 1, 8, 9, 2, 2, 9, 1, 23),
  C2NDE = c(3, 6, 1, 8, 9, 2, 2, 9, 1, 23),
  C3TEMP = c(3, 6, 89, 8, 9, 2, 2, 9, 1, 23),
  C3VIB = c(3, 6, 1, 98, 9, 2, 2, 9, 1, 23),
  C3DE = c(33, 56, 91, 82, 99, 12, 22, 19, 81, 23),
  C3NDE = c(13, 76, 91, 88, 59, 42, 22, 39, 21, 23),
  id = 1:10,
  stringsAsFactors = FALSE # spelled out: `F` is an ordinary, reassignable variable
)
library(tidyverse)

# Melt the data frame (reshape from wide to long format): one row per
# (id, original column) pair.
v_melt <- reshape2::melt(v, id.vars = "id")

# Derive the aggregation variables from the original column name. The block
# prefix ("C1", "C2", ..., "C10", ...) is located with a regular expression
# rather than fixed character positions, so block numbers with more than one
# digit are handled too.
v_melt <- v_melt %>%
  mutate(
    var = sub("^[^0-9]*[0-9]+", "", as.character(variable)),
    group_id = paste0(
      sub("^([^0-9]*[0-9]+).*$", "\\1", as.character(variable)),
      "_",
      id
    )
  )

# Reshape back to wide format: one row per block/observation, one column per
# measurement.
v_cast <- reshape2::dcast(v_melt, group_id ~ var, value.var = "value")
I am working with two data frames corresponding to the sample below:
# Example data ----
# Fix the RNG seed so that the random columns are reproducible.
set.seed(1)

# dta_a: ten rows, exactly one identifier per row.
dta_a <- data.frame(
  some_value = runif(n = 10),
  identifier = c(
    "A0001", "A0002", "A0003", "A0004", "A0005",
    "A0006", "B0001", "B0002", "B0003", "B0004"
  ),
  other_val = runif(n = 10)
)

# dta_b: six rows; a row may carry several comma-separated identifiers.
dta_b <- data.frame(
  variable_abc = runif(n = 6),
  identifier = c(
    "A0001", "A0002", "A0003,A0004,A0005,C0001",
    "B0001,B0002", "B0003", "B0004"
  ),
  variable_df = runif(n = 6)
)
I would like to merge those two data frames and obtain a data frame similar to the one presented below:
The resulting data frame would have the following qualities:
For the observations where only one identifier is present the merge command performs with all.y = TRUE and all.x = FALSE assuming that y is dta_b.
For the observations where multiple identifiers are provided only the first matched value from the dta_a is taken with the remaining values ignored. If there is no match on the first identifier (A0003) I would like for the command to attempt to match the next one (A0004).
I made a reference to the merge command but, naturally, dplyr and other solutions are fine.
You can 'melt' dta_b so that it has one row per individual identifier, together with a preference order, and then join on the individual identifiers:
library(dplyr)
library(tidyr)

# Expand dta_b to one row per individual identifier. For every original row
# the comma-separated identifiers are split, and each piece remembers the row
# it came from and its position in the list (`preference`, 1 = first choice).
# tibble() and bind_rows() replace data_frame() (deprecated) and rbind_all()
# (removed from dplyr).
melt_dta_b <- lapply(seq_len(nrow(dta_b)), function(i) {
  # as.character() so the split also works when identifier is a factor
  original_identifier <- as.character(dta_b$identifier[i])
  split_identifier <- strsplit(original_identifier, split = ",", fixed = TRUE)[[1]]
  tibble(
    identifier = split_identifier,
    original_identifier = original_identifier,
    original_row = i,
    preference = seq_along(split_identifier),
    variable_abc = dta_b$variable_abc[i],
    variable_df = dta_b$variable_df[i]
  )
})
melt_dta_b <- bind_rows(melt_dta_b)
At that point you can keep, for each original row, only the match with the highest preference (i.e. the lowest preference value):
# Attach the dta_a columns to every candidate identifier, drop the candidates
# that found no partner in dta_a, and keep, per original dta_b row, the
# remaining candidate that came first in the identifier list.
joined_df <- melt_dta_b %>%
  # explicit key: do not rely on whichever column names happen to be shared
  left_join(dta_a, by = "identifier") %>%
  filter(!is.na(some_value)) %>%
  group_by(original_row) %>%
  filter(preference == min(preference)) %>%
  ungroup()
UPDATE
In order not to call the variables explicitly by name, you can use the following code, which binds all the 'unused' columns of the original df:
# Same expansion as before, but without naming the payload columns: every
# column of the original dta_b row is carried along via cbind().
# tibble() and bind_rows() replace data_frame() (deprecated) and rbind_all()
# (removed from dplyr).
melt_dta_b <- lapply(seq_len(nrow(dta_b)), function(i) {
  tmp <- dta_b[i, ]
  split_identifier <- strsplit(as.character(tmp$identifier), split = ",", fixed = TRUE)[[1]]
  # rename by name rather than by position, so the column order does not matter
  names(tmp)[names(tmp) == "identifier"] <- "original_identifier"
  tibble(
    identifier = split_identifier,
    original_row = i,
    preference = seq_along(split_identifier)
  ) %>%
    cbind(tmp)
})
melt_dta_b <- bind_rows(melt_dta_b)
Just one way of doing it, but not best way I guess. Just made a try.
Split the identifiers and merge according to the first one.
# Split a vector of comma-separated identifiers into a data frame with one
# column per position: identifier_split.1, identifier_split.2, ...
# Rows with fewer identifiers are padded with NA (rbind-ing the ragged pieces
# would recycle them instead), and the columns are named explicitly so that
# identifier_split.1 exists even when no row holds more than one identifier.
split_identifiers <- function(ids) {
  parts <- strsplit(as.character(ids), split = ",", fixed = TRUE)
  width <- max(lengths(parts), 1L)
  padded <- lapply(parts, `length<-`, width) # extend each piece with NA
  out <- as.data.frame(
    matrix(as.character(unlist(padded)), ncol = width, byrow = TRUE),
    stringsAsFactors = FALSE
  )
  names(out) <- paste0("identifier_split.", seq_len(width))
  out
}

# Make sure the identifiers are plain character vectors (not factors).
dta_a$identifier <- as.character(dta_a$identifier)
dta_a1 <- cbind(dta_a, split_identifiers(dta_a$identifier))
dta_b$identifier <- as.character(dta_b$identifier)
dta_b1 <- cbind(dta_b, split_identifiers(dta_b$identifier))

# Merge on the first identifier, keeping every row of dta_b.
dta_join <- merge(
  dta_a1, dta_b1,
  by = "identifier_split.1",
  all.x = FALSE, all.y = TRUE
)
In cases where you don't have a match for the first identifier, you'll see NAs; you can subset those rows and merge them on the second identifier ("identifier_split.2").