Normalizing a multivalued column in a large table

Normalizing a multivalued column in a large table - r

I'm having an issue converting the VBA code seen in this this post to an R-script.
The problem is as follows, I have a column (from a source database, not by choice) that contains multiple values for the attribute. I'd like to normalize this table and retain the order in which each value occurs in each cell.
An example dataset:
dat <- data.frame(
ID = c(1:3),
Multi = c("VAL1 VAL2 VAL3","VAL2 VAL3","VAL3 VAL1")
,stringsAsFactors=FALSE)
ID Multi
1 1 VAL1 VAL2 VAL3
2 2 VAL2 VAL3
3 3 VAL2 VAL3 VAL1
The pseudocode would be something like:
Loop over each row
Split string Multi with a space as a separator
For each splitted string, append new row in a separate data.frame with the ID, Order within the total string and the value.
The result would look like:
ID Order Multi
1 1 1 VAL1
2 1 2 VAL2
3 1 3 VAL3
4 2 1 VAL2
5 2 2 VAL3
6 3 1 VAL2
7 3 2 VAL3
8 3 3 VAL1
I'm currently looking at doing so with a data.frame, I'm thinking data.table would be more appropriate as my table will have approximately 400.000 of these rows.
I apologize for not having any code ready, I'm still contemplating whether I need to use the apply family, data.table or a simple for loop. I'll keep this post updated with any progress I make.

Here are a couple of ways...
In base R:
X <- setNames(strsplit(as.character(dat$Multi), " "), dat$ID)
X1 <- stack(X)
X1$order <- ave(X1$ind, X1$ind, FUN = seq_along)
X1
# values ind order
# 1 VAL1 1 1
# 2 VAL2 1 2
# 3 VAL3 1 3
# 4 VAL2 2 1
# 5 VAL3 2 2
# 6 VAL2 3 1
# 7 VAL3 3 2
# 8 VAL1 3 3
OR (better):
X <- strsplit(as.character(dat[, "Multi"]), " ", fixed = TRUE)
len <- vapply(X, length, 1L)
data.frame(ID = rep(dat[, "ID"], len), order = sequence(len),
Multi = unlist(X, use.names=FALSE))
Using concat.split.multiple from my "splitstackshape" package (probably not too efficient on 400,000 rows though).
library(splitstackshape)
out <- concat.split.multiple(dat, "Multi", " ", "long")
out[order(out$ID, out$time), ]
# ID time Multi
# 1 1 1 VAL1
# 4 1 2 VAL2
# 7 1 3 VAL3
# 2 2 1 VAL2
# 5 2 2 VAL3
# 8 2 3 <NA>
# 3 3 1 VAL2
# 6 3 2 VAL3
# 9 3 3 VAL1
And, since you requested "data.table":
library(data.table)
DT <- data.table(dat)
DTL <- DT[, list(unlist(strsplit(as.character(Multi), " "))), by = ID]
DTL[, order := sequence(.N), by = ID]
DTL
# ID V1 order
# 1: 1 VAL1 1
# 2: 1 VAL2 2
# 3: 1 VAL3 3
# 4: 2 VAL2 1
# 5: 2 VAL3 2
# 6: 3 VAL2 1
# 7: 3 VAL3 2
# 8: 3 VAL1 3
Update: Some timings
I didn't bother testing my "splitstackshape" approach, because that uses read.table under the hood, so I know it won't stand up to the demands of performance.
However, base R still seems to win out!
Sample "big" data:
dat_big <- do.call(rbind, replicate(floor(4000/3), dat, simplify = FALSE))
dat_big <- do.call(rbind, replicate(100, dat_big, simplify = FALSE))
dat_big$ID <- make.unique(as.character(dat_big$ID))
DT <- data.table(dat)
DT_big <- data.table(dat_big)
Functions to test:
fun1 <- function(inDF) {
X <- strsplit(as.character(inDF[, "Multi"]), " ", fixed = TRUE)
len <- vapply(X, length, 1L)
data.frame(ID = rep(inDF[, "ID"], len), order = sequence(len),
Multi = unlist(X, use.names=FALSE))
}
fun2 <- function(inDT) {
DTL <- inDT[, list(unlist(strsplit(as.character(Multi), " ", fixed = TRUE))), by = ID]
DTL[, order := sequence(.N), by = ID]
DTL
}
The results for base R:
system.time(outDF <- fun1(dat_big))
# user system elapsed
# 6.418 0.000 6.454
dim(outDF)
# [1] 1066400 3
head(outDF)
# ID order Multi
# 1 1 1 VAL1
# 2 1 2 VAL2
# 3 1 3 VAL3
# 4 2 1 VAL2
# 5 2 2 VAL3
# 6 3 1 VAL2
tail(outDF)
# ID order Multi
# 1066395 1.133299 3 VAL3
# 1066396 2.133299 1 VAL2
# 1066397 2.133299 2 VAL3
# 1066398 3.133299 1 VAL2
# 1066399 3.133299 2 VAL3
# 1066400 3.133299 3 VAL1
The results for "data.table":
system.time(outDT <- fun2(DT_big))
# user system elapsed
# 14.035 0.000 14.057
dim(outDT)
# [1] 1066400 3
outDT
# ID V1 order
# 1: 1 VAL1 1
# 2: 1 VAL2 2
# 3: 1 VAL3 3
# 4: 2 VAL2 1
# 5: 2 VAL3 2
# ---
# 1066396: 2.133299 VAL2 1
# 1066397: 2.133299 VAL3 2
# 1066398: 3.133299 VAL2 1
# 1066399: 3.133299 VAL3 2
# 1066400: 3.133299 VAL1 3

Related

R apply function by colnames with increasing integer

I am scoring a survey (23 items) with multiple options for each item (it is select all answers that apply, not one choice per item), and trying to find the minimum, maximum and average value for each item. I have written code to do this (below) but am wondering if there is a more efficient way than to cut and paste the three lines creating min, max and avg columns for every item.
Here is the reproducible example with 2 questions, 3 answer options for questions:
#Establish dataframe
dt <- data.frame(matrix(sample(0:1,30,replace=TRUE), ncol = 6))
colnames(dt) <- c("OptA_1", "OptB_1", "OptC_1", "OptA_2", "OptB_2", "OptC_2")
#Rescore incorrect value
dt[ ,grepl("OptB_", colnames(dt))] <- ifelse(dt[ ,grepl("OptB_", colnames(dt))]==1, 2, NA)
dt[ ,grepl("OptC_", colnames(dt))] <- ifelse(dt[ ,grepl("OptC_", colnames(dt))]==1, 3, NA)
dt[ ,grepl("OptA_", colnames(dt))] <- ifelse(dt[ ,grepl("OptA_", colnames(dt))]==1, 1, NA)
This is the code to calculate the values (note here Option A, B and C are the answer choices, while the _1 denotes item 1)
##Calculate Values
dt$it1_min <- apply(dt[ ,c("OptA_1", "OptB_1", "OptC_1")], 1, min, na.rm=T)
dt$it1_max <- apply(dt[ ,c("OptA_1", "OptB_1", "OptC_1")], 1, max, na.rm=T)
dt$it1_avg <- rowMeans(dt[ ,c("OptA_1", "OptB_1", "OptC_1")], na.rm=T)
I am wondering if I need to do the above ^ for every single item, or if it's possible to write a function so that I can score all items (OptA_1 through OptA_23) more efficiently.
###potentially repetitive code?
dt$it2_min <- apply(dt[ ,c("OptA_2", "OptB_2", "OptC_2")], 1, min, na.rm=T)
dt$it2_max <- apply(dt[ ,c("OptA_2", "OptB_2", "OptC_2")], 1, max, na.rm=T)
dt$it2_avg <- rowMeans(dt[ ,c("OptA_2", "OptB_2", "OptC_2")], na.rm=T)
Here is what the eventual scoring will look like:
##Eventual scoring
dt$tot_min <- rowSums(dt[ ,c("it1_min", "it2_min")], na.rm=T)
dt$tot_max <- rowSums(dt[ ,c("it1_max", "it2_max")], na.rm=T)
dt$tot_avg <- rowSums(dt[ ,c("it1_avg", "it2_avg")], na.rm=T)

You will need to convert the data to long form first (tidyr::pivot_longer).
library(dplyr)
library(tidyr)
dt_long <- dt %>%
# add identifier for participant
mutate(participant = row_number()) %>%
# convert to long form using pattern
pivot_longer(cols = -participant,
names_pattern = "Opt(.*)_(\\d+)",
names_to = c("answer_choice", "item"),
values_to = "selected")
dt_long
# long form data looks like this
# A tibble: 30 x 4
# participant answer_choice item selected
# <int> <chr> <chr> <dbl>
# 1 1 A 1 NA
# 2 1 B 1 2
# 3 1 C 1 NA
# 4 1 A 2 1
# 5 1 B 2 2
# 6 1 C 2 3
# now group by each participant and item and compute the required fields
dt_long %>%
group_by(item, participant) %>%
summarise(it_min = min(selected, na.rm = TRUE),
it_max = max(selected, na.rm = TRUE),
it_avg = mean(selected, na.rm = TRUE))
#> # A tibble: 10 x 5
#> # Groups: item [2]
#> item participant it_min it_max it_avg
#> <chr> <int> <dbl> <dbl> <dbl>
#> 1 1 1 2 2 2
#> 2 1 2 2 2 2
#> 3 1 3 1 3 2
#> 4 1 4 2 3 2.5
#> 5 1 5 3 3 3
#> 6 2 1 1 1 1
#> 7 2 2 1 3 2
#> 8 2 3 1 3 2
#> 9 2 4 1 3 2
#> 10 2 5 1 2 1.5

You can use data.table to melt your dt long, estimate your indicators by group, and then dcast back to wide format:
library(data.table)
dt = melt(setDT(dt)[, row:=.I], id.vars="row")[, c("variable","grp") := tstrsplit(variable, "_")][]
dcast(dt[, .(it_min = min(value,na.rm=T),
it_max = max(value,na.rm=T),
it_avg = mean(value, na.rm=T)
), by=.(row,grp)],
row~grp,
value.var=c("it_min", "it_max", "it_avg")
)
Output: (note that you used sample() above, without setting a seed, see my reproducible data below)
row it_min_1 it_min_2 it_max_1 it_max_2 it_avg_1 it_avg_2
<int> <num> <num> <num> <num> <num> <num>
1: 1 2 NA 3 NA 2.5 NaN
2: 2 2 1 3 3 2.5 2
3: 3 2 3 3 3 2.5 3
4: 4 1 NA 1 NA 1.0 NaN
5: 5 3 3 3 3 3.0 3
Input Data:
set.seed(123)
dt <- data.frame(matrix(sample(0:1,30,replace=TRUE), ncol = 6))
colnames(dt) <- c("OptA_1", "OptB_1", "OptC_1", "OptA_2", "OptB_2", "OptC_2")
dt[ ,grepl("OptB_", colnames(dt))] <- ifelse(dt[ ,grepl("OptB_", colnames(dt))]==1, 2, NA)
dt[ ,grepl("OptC_", colnames(dt))] <- ifelse(dt[ ,grepl("OptC_", colnames(dt))]==1, 3, NA)
dt[ ,grepl("OptA_", colnames(dt))] <- ifelse(dt[ ,grepl("OptA_", colnames(dt))]==1, 1, NA)

How to populate a cell in a data frame in R based on the word in a previous cell in that same column?

I have a data frame in R that I've edited to look like the following:
enter image description here
I would like to change the cells in column 2 with an 'x' to match the word at the top of the list of x's. For example, I would like column 2 rows 8-20 to list the word "farm_working_expenses", column 2 rows 22-27 to list the word "feed_expenses", etc.
Is there any way to edit the 'x' cells in the data frame to match the contents of a previous cell in the same column?

dplyr::na_if() can be used to replace x with NA.
data1 <- data.frame(col1 = LETTERS[1:10],
col2 = c('val1',rep('x',3),'val2',rep('x',3),'val3','x'))
data1
# col1 col2
# 1 A val1
# 2 B x
# 3 C x
# 4 D x
# 5 E val2
# 6 F x
# 7 G x
# 8 H x
# 9 I val3
# 10 J x
data1$col2 <- dplyr::na_if(data1$col2, 'x')
Using tidyr::fill(), last observation can be carried forward till a new value appears in the same column.
data1 %>% tidyr::fill(col2)
# col1 col2
# 1 A val1
# 2 B val1
# 3 C val1
# 4 D val1
# 5 E val2
# 6 F val2
# 7 G val2
# 8 H val2
# 9 I val3
# 10 J val3
zoo::na.locf can also be used for similar purpose.
data1 <- data.frame(col1 = LETTERS[1:10],
col2 = c('val1',rep(NA,3),'val2',rep(NA,3),'val3',NA))
data1
# col1 col2
# 1 A val1
# 2 B <NA>
# 3 C <NA>
# 4 D <NA>
# 5 E val2
# 6 F <NA>
# 7 G <NA>
# 8 H <NA>
# 9 I val3
# 10 J <NA>
zoo::na.locf(data1$col2)
# [1] val1 val1 val1 val1 val2 val2 val2 val2 val3 val3

split, apply and combine on 2 columns of data

I've got a dataframe consisting of a group and 2 value columns, as such:
group val1 val2
A 5 3
A 2 4
A 3 1
B 3 6
B 2 1
B 0 2
I want to work out the number of rows where val1 > val2, split by subset. Initially I hardcoded this per subgroup with:
number_a <- nrow(subset(df, group=="A" & val1 > val2))
number_b <- nrow(subset(df, group=="B" & val1 > val2))
What's the proper way of automating this? I tried using the split() function but I couldn't work out how to pass in both val1 and val2 column.

Pretty straight forward using data.table
If you want the number of rows
library(data.table)
setDT(df)[, .(RowsNum = sum(val1 > val2)), by = group]
# group RowsNum
# 1: A 2
# 2: B 1
If you looking for split, apply combinations in base R, could also try
sapply(split(df[-1], df[1]), function(x) sum(x[1] > x[2]))
# A B
# 2 1
Or using tapply (also from base R)
tapply(with(df, val1 > val2), df[1], sum)
# group
# A B
# 2 1
If you want the rows themselves
setDT(df)[, .SD[val1 > val2]]
# group val1 val2
# 1: A 5 3
# 2: A 3 1
# 3: B 2 1
Or very simple with base R too
df[with(df, val1 > val2), ]
# group val1 val2
# 1 A 5 3
# 3 A 3 1
# 5 B 2 1
Or
subset(df, val1 > val2)
# group val1 val2
# 1 A 5 3
# 3 A 3 1
# 5 B 2 1

Another option using dplyr
library(dplyr)
filter(df, val1 >val2)
# group val1 val2
#1 A 5 3
#2 A 3 1
#3 B 2 1
If you need the nrows
df %>%
group_by(group) %>%
filter(val1 >val2) %>%
summarise(RowsNum=n())
# group RowsNum
#1 A 2
#2 B 1
Or using aggregate from base R
aggregate(cbind(RowsNum = val1 > val2) ~ group, df, sum)
# group RowsNum
#1 A 2
#2 B 1

You can try this
data <- data.frame(group,val1,val2)
attach(data)
aggregate(val1~group,data[which(val1 > val2),],length)

How to look up values in R based on row and column

i have a data frame
which is
name val1 val2
A 2 3
B 5 6
C 7 9
i want a dataframe
name Grp value
A val1 2
B val1 5
C val1 7
A val2 3
B val2 6
C val2 9
what is the optimal solution ?

It's called reshaping a data.frame from wide to long format. One option to do that would be, using the package tidyr:
library(tidyr)
gather(df, Grp, Value, -name)
# name Grp Value
#1 A val1 2
#2 B val1 5
#3 C val1 7
#4 A val2 3
#5 B val2 6
#6 C val2 9

Or just use melt from reshape2 package
library(reshape2)
melt(df)
# name variable value
# 1 A val1 2
# 2 B val1 5
# 3 C val1 7
# 4 A val2 3
# 5 B val2 6
# 6 C val2 9
Edit: per #AnandaMahtos comment, if you insist on having your variable name called Grp, you can alternatively do:
melt(df, variable.name = "Grp")

A base R option would be to use stack
cbind(name=df$name, stack(df, select=-name))
# name values ind
#1 A 2 val1
#2 B 5 val1
#3 C 7 val1
#4 A 3 val2
#5 B 6 val2
#6 C 9 val2

Or without any extra package:
ddff<- data.frame(name=c("A","B","C"),val1=c(2,5,7),val2=c(3,6,9))
ddff2<- reshape(ddff,direction="long", varying = list(names(ddff)[2:3]),v.names=c("values"),times=c("val1","val2"))[,1:3]
ddff2
# name time values
#1.val1 A val1 2
#2.val1 B val1 5
#3.val1 C val1 7
#1.val2 A val2 3
#2.val2 B val2 6
#3.val2 C val2 9

Remove duplicates based on 2nd column condition

I am trying to remove duplicate rows from a data frame based on the max value on a different column
So, for the data frame:
df<-data.frame (rbind(c("a",2,3),c("a",3,4),c("a",3,5),c("b",1,3),c("b",2,6),c("r",4,5))
colnames(df)<-c("id","val1","val2")
id val1 val2
a 2 3
a 3 4
a 3 5
b 1 3
b 2 6
r 4 5
I would like to keep remove all duplicates by id with the condition that for the corresponding rows they do not have the maximum value for val2.
Thus the data frame should become:
a 3 5
b 2 6
r 4 5
-> remove all a duplicates but keep row with the max value for df$val2 for subset(df, df$id=="a")

Using base R. Here, the columns are factors. Make sure to convert it to numeric
df$val2 <- as.numeric(as.character(df$val2))
df[with(df, ave(val2, id, FUN=max)==val2),]
# id val1 val2
#3 a 3 5
#5 b 2 6
#6 r 4 5
Or using dplyr
library(dplyr)
df %>%
group_by(id) %>%
filter(val2==max(val2))
# id val1 val2
#1 a 3 5
#2 b 2 6
#3 r 4 5

One possible way is to use data.table
library(data.table)
setDT(df)[, .SD[which.max(val2)], by = id]
## id val1 val2
## 1: a 3 5
## 2: b 2 6
## 3: r 4 5

Here's how I hope your data is really set up
df <- data.frame (id = c(rep("a", 3), rep("b", 2), "r"),
val1 = c(2, 3, 3, 1, 2, 4), val2 = c(3, 4, 5, 3, 6, 5))
You could do a split-unsplit
> unsplit(lapply(split(df, df$id), function(x) {
if(nrow(x) > 1) {
x[duplicated(x$id) & x$val2 == max(x$val2),]
} else {
x
}
}), levels(df$id))
# id val1 val2
# 3 a 3 5
# 5 b 2 6
# 6 r 4 5
You can also use Reduce(rbind, ...) or do.call(rbind, ...) in place of unsplit

Another one
df %>% group_by(id) %>%
slice(which.max(val2))
id val1 val2
a 3 5
b 2 6
r 4 5

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Normalizing a multivalued column in a large table - r

Related

R apply function by colnames with increasing integer

How to populate a cell in a data frame in R based on the word in a previous cell in that same column?

split, apply and combine on 2 columns of data

How to look up values in R based on row and column

Remove duplicates based on 2nd column condition

Categories

Resources