Conditionally merge rows - r

I am doing some tricky data cleaning. I have one dataset (first extract below) that is the output from the digitization of pdf tables. Unfortunately columns were not digitized properly. Sometimes, what shall be in column X3 ended up concatenated in column X2 with the last word of column X2...
What I am trying to do is to bring back what should be in column X3 to X3 and collapse the two rows in X2 together.
I have attached an extract of the output I am trying to create.
Any idea about how can I do this?
Thank you!
structure(list(X1 = c(111L, NA, 2L, NA, NA, 121L, NA, NA, 121L,
NA, NA, 141L, NA, NA, 141L, NA), X2 = structure(c(7L, 1L, 8L,
1L, 1L, 9L, 1L, 1L, 6L, 3L, 1L, 5L, 2L, 1L, 10L, 4L), .Label = c("",
"A - BWHITE", "ASMITH", "B - DBURNEY", "Garden Harris", "House M. Aba",
"House M. Bab", "House M. Cac", "Street M. Bak", "Villa Thomas"
), class = "factor"), X3 = structure(c(2L, 1L, 3L, 1L, 1L, 4L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "A",
"A - C", "D"), class = "factor")), class = "data.frame", row.names = c(NA,
-16L))
structure(list(X1 = c(111L, NA, 2L, NA, NA, 121L, NA, NA, 121L,
NA, NA, 141L, NA, NA, 141L), X2 = structure(c(4L, 1L, 5L, 1L,
1L, 6L, 1L, 1L, 3L, 1L, 1L, 2L, 1L, 1L, 7L), .Label = c("", "Garden Harris WHITE",
"House M. Aba SMITH", "House M. Bab", "House M. Cac", "Street M. Bak",
"Villa Thomas BURNEY"), class = "factor"), X3 = structure(c(2L,
1L, 4L, 1L, 1L, 6L, 1L, 1L, 2L, 1L, 1L, 3L, 1L, 1L, 5L), .Label = c("",
"A", "A - B", "A - C", "B - D", "D"), class = "factor")), class = "data.frame", row.names = c(NA,
-15L))
Follow up question here: Cleaning extract_tables conditional merge rows, systematic extraction

You could use tidyverse:
library(tidyr)
library(stringr)
library(dplyr)
df %>%
filter(X2 != "") %>%
mutate(
extract_name = lead(str_extract(X2, "(?<=[A-Z])[A-Z]+")),
extract_part = lead(str_extract(X2, "[A-Z](\\s-\\s[A-Z]){0,1}(?=[A-Z]+)")),
new_X2 = ifelse(!is.na(extract_name), paste(X2, extract_name), as.character(X2)),
new_X3 = ifelse(X3 != "", as.character(X3), extract_part)
) %>%
drop_na(X1) %>%
select(-extract_name, -extract_part)
which returns
X1 X2 X3 new_X2 new_X3
1 111 House M. Bab A House M. Bab A
2 2 House M. Cac A - C House M. Cac A - C
3 121 Street M. Bak D Street M. Bak D
4 121 House M. Aba House M. Aba SMITH A
5 141 Garden Harris Garden Harris WHITE A - B
6 141 Villa Thomas Villa Thomas BURNEY B - D
Note: I don't think this approach is really stable regarding the regex used. For readability I filtered out some annoying rows containing NA and empty strings, you should remove those parts if necessary.

Here is how we could do it:
Credits to MartinGal for the regex "(?<=[A-Z])[A-Z]+") (upvote!)
Replace empty values with NA
Use lead to move rows up in X3 conditional on NA else not
filter if is not NA in X1
Extract the important information with str_extract and regex "(?<=[A-Z])[A-Z]+" -> combine this info with column X2 with str_c and finally coalesce both.
Remove the string to keep relevant one with regex and str_remove
library(dyplr)
library(stringr)
df %>%
mutate(across(everything(), ~sub("^\\s*$", NA, .)),
X3= ifelse(is.na(X3), lead(X2), X3)) %>%
filter(!is.na(X1)) %>%
mutate(X2 = coalesce(str_c(X2," ", str_extract(X3, "(?<=[A-Z])[A-Z]+")), X2),
X3 = str_remove_all(X3, "(?<=[A-Z])[A-Z]+"))
Output:
X1 X2 X3
1 111 House M. Bab A
2 2 House M. Cac A - C
3 121 Street M. Bak D
4 121 House M. Aba SMITH A
5 141 Garden Harris WHITE A - B
6 141 Villa Thomas BURNEY B - D

This is a yucky one:
# Retype the data and nullify empty values;
# use X1 as a key: intermediateResult => data.frame
intermediateResult <- data.frame(
lapply(
transform(
replace(df, df == "", NA_character_),
X1 = na.omit(X1)[cumsum(!is.na(X1))]
),
as.character
)
)
# Re-structure the data:
# interemdiateResult2 => data.frame
intermediateResult2 <- do.call(
rbind,
Filter(
function(y){
nrow(y) > 0
},
Map(
function(x){
z <- x[!is.na(x$X2),]
if(nrow(z) > 1 & is.na(z$X3[1])){
z$X3[1] <- z$X2[2]
head(z, 1)
}else{
z
}
},
with(
intermediateResult,
split(
intermediateResult,
paste(
X1,
cumsum(
(is.na(X2)
)
),
sep = " - "
)
)
)
)
)
)
# Regex it and hope for the best:
# result => data.frame
result <- data.frame(
transform(
intermediateResult2,
X2 = paste0(
X2,
ifelse(
(nchar(X3) == 1 | grepl("^\\w\\s+-\\s+\\w$", X3)),
"",
ifelse(
!(grepl("^\\w\\s+-\\s+\\w", X3)),
paste0(" ", substr(X3, 2, nchar(X3))),
paste0(" ", gsub("(^\\w\\s+-\\s+\\w)(.*)", "\\2", X3))
)
)
),
X3 = ifelse(
nchar(X3) == 1 | grepl("^\\w\\s+-\\s+\\w$", X3) ,
X3,
ifelse(
!(grepl("^\\w\\s+-\\s+\\w", X3)),
substr(X3, 1, 1),
gsub("(^\\w\\s+-\\s+\\w)(.*)", "\\1", X3)
)
)
),
row.names = NULL
)

Related

How to add column reporting sum of couple of subsequent rows

I have the following dataset
structure(list(Var1 = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L), .Label = c("0", "1"), class = "factor"), Var2 = structure(c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("congruent", "incongruent"
), class = "factor"), Var3 = structure(c(1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L), .Label = c("spoken", "written"), class = "factor"),
Freq = c(8L, 2L, 10L, 2L, 10L, 2L, 10L, 2L)), class = "data.frame", row.names = c(NA,
-8L))
I would like to add another column reporting sum of coupled subsequent rows. Thus the final result would look like this:
I have proceeded like this
Table = as.data.frame(table(data_1$unimodal,data_1$cong_cond, data_1$presentation_mode)) %>%
mutate(Var1 = factor(Var1, levels = c('0', '1')))
row = Table %>% #is.factor(Table$Var1)
summarise(across(where(is.numeric),
~ .[Var1 == '0'] + .[Var1 == '1'],
.names = "{.col}_sum"))
column = c(rbind(row$Freq_sum,rep(NA, 4)))
Table$column = column
But I am looking for the quickest way possible with no scripting separated codes. Here I have used the dplyr package, but if you might know possibly suggest some other ways with map(), for loop, and or the method you deem as the best, please just let me know.
This should do:
df$column <-
rep(colSums(matrix(df$Freq, 2)), each=2) * c(1, NA)
If you are fine with no NAs in the dataframe, you can
df %>%
group_by(Var2, Var3) %>%
mutate(column = sum(Freq))
# A tibble: 8 × 5
# Groups: Var2, Var3 [4]
Var1 Var2 Var3 Freq column
<fct> <fct> <fct> <int> <int>
1 0 congruent spoken 8 10
2 1 congruent spoken 2 10
3 0 incongruent spoken 10 12
4 1 incongruent spoken 2 12
5 0 congruent written 10 12
6 1 congruent written 2 12
7 0 incongruent written 10 12
8 1 incongruent written 2 12

Converting a dialogue tibble to .txt, and back again

I want to take a tibble that represents dialogue and turn it into a .txt that can be manually edited in a text editor and then returned to a tibble for processing.
The key challenge I've had is separating the blocks of text in a way that they can be re-imported to a similar format after editing while preserving the "Speaker" designation.
Speed is important as the volume of files and the length of each text segment are large.
Here's the input tibble:
tibble::tribble(
~word, ~speakerTag,
"been", 1L,
"going", 1L,
"on", 1L,
"and", 1L,
"what", 1L,
"your", 1L,
"goals", 1L,
"are.", 1L,
"Yeah,", 2L,
"so", 2L,
"so", 2L,
"John", 2L,
"has", 2L,
"15", 2L
)
Here's the desired output in a .txt:
###Speaker 1###
been going on and what your goals are.
###Speaker 2###
Yeah, so so John has 15
Here's the desired return after correcting errors manually:
~word, ~speakerTag,
"been", 1L,
"going", 1L,
"on", 1L,
"and", 1L,
"what", 1L,
"your", 1L,
"goals", 1L,
"in", 1L,
"r", 1L,
"Yeah,", 2L,
"so", 2L,
"so", 2L,
"John", 2L,
"hates", 2L,
"50", 2L
)
One way would be to add Speaker name "\n" at the start of each speakerTag
library(data.table)
library(dplyr)
library(tidyr)
setDT(df)[, word := replace(word, 1, paste0("\n\nSpeaker",
first(speakerTag), '\n\n', first(word))), rleid(speakerTag)]
We can write this in text file using
writeLines(paste(df$word, collapse = " "), 'Downloads/temp.txt')
It looks like this :
cat(paste(df$word, collapse = " "))
#Speaker1
#
#been going on and what your goals are.
#
#Speaker2
#
#Yeah, so so John has 15
To read it back in R, we can do :
read.table('Downloads/temp.txt', sep="\t", col.names = 'word') %>%
mutate(SpeakerTag = replace(word, c(FALSE, TRUE), NA)) %>%
fill(SpeakerTag) %>%
slice(seq(2, n(), 2)) %>%
separate_rows(word, sep = "\\s") %>%
filter(word != '')
# word SpeakerTag
#1 been Speaker1
#2 going Speaker1
#3 on Speaker1
#4 and Speaker1
#5 what Speaker1
#6 your Speaker1
#7 goals Speaker1
#8 are. Speaker1
#9 Yeah, Speaker2
#10 so Speaker2
#11 so Speaker2
#12 John Speaker2
#13 has Speaker2
#14 15 Speaker2
Obviously we can remove "Speaker" part in SpeakerTag column if it is not needed.

Replacing loop in dplyr R

So I am trying to program function with dplyr withou loop and here is something I do not know how to do
Say we have tv stations (x,y,z) and months (2,3). If I group by this say we get
this output also with summarised numeric value
TV months value
x 2 52
y 2 87
z 2 65
x 3 180
y 3 36
z 3 99
This is for evaluated Brand.
Then I will have many Brands I need to filter to get only those which get value >=0.8*value of evaluated brand & <=1.2*value of evaluated brand
So for example from this down I would only want to filter first two, and this should be done for all months&TV combinations
brand TV MONTH value
sdg x 2 60
sdfg x 2 55
shs x 2 120
sdg x 2 11
sdga x 2 5000
As #akrun said, you need to use a combination of merging and subsetting. Here's a base R solution.
m <- merge(df, data, by.x=c("TV", "MONTH"), by.y=c("TV", "months"))
m[m$value.x >= m$value.y*0.8 & m$value.x <= m$value.y*1.2,][,-5]
# TV MONTH brand value.x
#1 x 2 sdg 60
#2 x 2 sdfg 55
Data
data <- structure(list(TV = structure(c(1L, 2L, 3L, 1L, 2L, 3L), .Label = c("x",
"y", "z"), class = "factor"), months = c(2L, 2L, 2L, 3L, 3L,
3L), value = c(52L, 87L, 65L, 180L, 36L, 99L)), .Names = c("TV",
"months", "value"), class = "data.frame", row.names = c(NA, -6L
))
df <- structure(list(brand = structure(c(2L, 1L, 4L, 2L, 3L), .Label = c("sdfg",
"sdg", "sdga", "shs"), class = "factor"), TV = structure(c(1L,
1L, 1L, 1L, 1L), .Label = "x", class = "factor"), MONTH = c(2L,
2L, 2L, 2L, 2L), value = c(60L, 55L, 120L, 11L, 5000L)), .Names = c("brand",
"TV", "MONTH", "value"), class = "data.frame", row.names = c(NA,
-5L))

Calculating ratios by group with dplyr

Using the following dataframe I would like to group the data by replicate and group and then calculate a ratio of treatment values to control values.
structure(list(group = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("case", "controls"), class = "factor"), treatment = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "EPA", class = "factor"),
replicate = structure(c(2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L), .Label = c("four",
"one", "three", "two"), class = "factor"), fatty_acid_family = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "saturated", class = "factor"),
fatty_acid = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "14:0", class = "factor"),
quant = c(6.16, 6.415, 4.02, 4.05, 4.62, 4.435, 3.755, 3.755
)), .Names = c("group", "treatment", "replicate", "fatty_acid_family",
"fatty_acid", "quant"), class = "data.frame", row.names = c(NA,
-8L))
I have tried using dplyr as follows:
group_by(dataIn, replicate, group) %>% transmute(ratio = quant[group=="case"]/quant[group=="controls"])
but this results in Error: incompatible size (%d), expecting %d (the group size) or 1
Initially I thought this might be because I was trying to create 4 ratios from a df 8 rows deep and so I thought summarise might be the answer (collapsing each group to one ratio) but that doesn't work either (my understanding is a shortcoming).
group_by(dataIn, replicate, group) %>% summarise(ratio = quant[group=="case"]/quant[group=="controls"])
replicate group ratio
1 four case NA
2 four controls NA
3 one case NA
4 one controls NA
5 three case NA
6 three controls NA
7 two case NA
8 two controls NA
I would appreciate some advice on where I'm going wrong or even if this can be done with dplyr.
Thanks.
You can try:
group_by(dataIn, replicate) %>%
summarise(ratio = quant[group=="case"]/quant[group=="controls"])
#Source: local data frame [4 x 2]
#
# replicate ratio
#1 four 1.078562
#2 one 1.333333
#3 three 1.070573
#4 two 1.446449
Because you grouped by replicate and group, you could not access data from different groups at the same time.
#talat's answer solved for me. I created a minimal reproducible example to help my own understanding:
df <- structure(list(a = c("a", "a", "b", "b", "c", "c", "d", "d"),
b = c(1, 2, 1, 2, 1, 2, 1, 2), c = c(22, 15, 5, 0.2, 107,
6, 0.2, 4)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
# a b c
# 1 a 1 22.0
# 2 a 2 15.0
# 3 b 1 5.0
# 4 b 2 0.2
# 5 c 1 107.0
# 6 c 2 6.0
# 7 d 1 0.2
# 8 d 2 4.0
library(dplyr)
df %>%
group_by(a) %>%
summarise(prop = c[b == 1] / c[b == 2])
# a prop
# 1 a 1.466667
# 2 b 25.000000
# 3 c 17.833333
# 4 d 0.050000

Preserve NA in output of ifelse statement using paste

I have data that is organized like below M1 - M4, and I use the code from here to generate M_NEW:
M1 M2 M3 M4 M_NEW
1 1,2 0 1 1
3,4 3,4 1,2,3,4 4 3,4
NA NA 1 2 NA
It looks for a specified number of occurneces of number in the four columns and reports those numbers in M_NEW. Now, I would like to include the numbers 0 and 21 to each of the observations, unless that observation is NA. However, so far, I am unable to paste 0 and 21 to the observations, without also pasting them the NA values. The desired output is include in df below as M_NEW1. How can this be accomplished? It appears that I am missing something with paste here.
# sample data
df <- structure(list(M1 = structure(c(3L, 4L, 2L, 2L, 1L, 5L, NA, 6L
), .Label = c("0", "1", "1,2", "1,2,3,4", "1,2,3,4,5", "3,4,5,6,7"
), class = "factor"), M2 = structure(c(3L, NA, 2L, 2L, 1L, 4L,
NA, 5L), .Label = c("0", "1,2", "1,2,3,4,5", "4,5,6", "4,5,6,7,8,9,10,11,12,13,14"
), class = "factor"), M3 = structure(c(3L, NA, 1L, 1L, 1L, 2L,
NA, 4L), .Label = c("0", "1,2,3,4", "1,2,3,4,5", "1,2,3,4,5,6,7,8"
), class = "factor"), M4 = structure(c(3L, NA, 1L, 2L, 1L, 5L,
NA, 4L), .Label = c("0", "1", "1,2,3,4,5,6", "1,2,3,4,5,6,7,8,9,10,11,12",
"4,5"), class = "factor"), M_NEW1 = structure(c(3L, NA, 1L, 2L,
1L, 5L, NA, 4L), .Label = c("0,21", "1,0,21", "1,2,3,4,5,0,21",
"3,4,5,6,7,8,0,21", "4,5,0,21"), class = "factor")), .Names = c("M1",
"M2", "M3", "M4", "M_NEW1"), class = "data.frame", row.names = c(NA,
-8L))
# function slightly modified from https://stackoverflow.com/a/23203159/1670053
f <- function(x, n=3) {
tab <- table(strsplit(paste(x, collapse=","), ","))
res <- paste(names(tab[which(tab >= n)]), collapse=",")
return(ifelse(is.na(res), NA, ifelse(res == 0, "0,21", paste(res,",0,21",sep=""))))
#return(ifelse(is.na(res), ifelse(res == 0, "0,21", NA), paste(res,",0,21",sep=""))) #https://stackoverflow.com/a/17554670/1670053
#return(ifelse(is.na(res), NA, ifelse(res == 0, "0,21", paste(na.omit(res),",0,21",sep=""))))
#return(ifelse(is.na(res), as.character(NA), ifelse(res == 0, "0,21", paste(res,",0,21",sep=""))))
}
df$M_NEW2 <- apply(df[, 1:4], 1, f))
You can add another if else statement - rather inelegant but gets you there.
f2 <- function(x, n=3) {
tab <- table(strsplit(paste(x, collapse=","), ","))
res <- paste(names(tab[which(tab >= n)]), collapse=",")
res <- ifelse(res %in% c("0", ""), "0,21", res)
if(res %in% c("NA","0,21")) res else paste(res, "0,21", sep=",")
}
apply(df[1:4], 1, f2)
# "1,2,3,4,5,0,21" "NA" "0,21" "1,0,21" "0,21" "4,5,0,21" "NA"
# "3,4,5,6,7,8,0,21"

Resources