Match bigram frequencies to bigram tokens across multiple columns - r

I have two dataframes, one is a frequency list with bigram frequencies:
F_bigrams <- structure(list(word_tag = c("it_PNP 's_VBZ", "do_VDB n't_XX0",
"that_DT0 's_VBZ", "you_PNP know_VVB", "i_PNP 'm_VBB", "i_PNP do_VDB",
"in_PRP the_AT0", "i_PNP 've_VHB", "'ve_VHB got_VVN", "i_PNP mean_VVB"
), Freq_bigr = c(31831L, 26273L, 21691L, 14157L, 14010L, 12904L,
10994L, 10543L, 10089L, 9856L)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"))
The other contains bigram tokens:
df <- data.frame(
bigr_1_2 = c("i_PNP 'm_VBB", NA, NA, NA),
bigr_2_3 = c("it_PNP 's_VBZ", "'ve_VHB got_VVN", NA, NA),
bigr_3_4 = c("you_PNP know_VVB", "it_PNP 's_VBZ", "'ve_VHB got_VVN", NA)
)
I want to match the frequencies from the frequency list F_bigrams to each bigram token in df. This I can do without problems in df, which is a tiny snippet of the actual data, with this base R method:
df[, paste0("f_bigr_", 1:3, "_", 2:4)] <- sapply(df[, 1:3], function(x) F_bigrams$Freq_bigr[match(x, F_bigrams$word_tag)])
However, in the actual data, which has far more columns and half a million rows, I consistently get the number 2 where there should be NA. Why is that? And, more importantly, is there an alternative way to match the frequencies to their respective bigram tokens?

# Look up each bigram's frequency: reshape to long form, join on the bigram
# string, then reshape back to one row per original row.
# NOTE: values_drop_na = TRUE removes NA bigrams before the join, so a row of
# `df` whose bigram columns are all NA does not appear in the result.
library(dplyr)
library(tidyr)
library(tibble)
df %>%
  rowid_to_column() %>%
  pivot_longer(-rowid, values_to = 'word_tag', values_drop_na = TRUE) %>%
  # Name the join key explicitly instead of relying on the common-column guess.
  left_join(F_bigrams, by = "word_tag") %>%
  # Pass id_cols by name; specifying it by position is deprecated in tidyr.
  pivot_wider(id_cols = rowid, values_from = c(word_tag, Freq_bigr))
rowid word_tag_bigr_1_2 word_tag_bigr_2_3 word_tag_bigr_3_4 Freq_bigr_bigr_1_2 Freq_bigr_bigr_2_3 Freq_bigr_bigr_3_4
<int> <chr> <chr> <chr> <int> <int> <int>
1 1 i_PNP 'm_VBB it_PNP 's_VBZ you_PNP know_VVB 14010 31831 14157
2 2 NA 've_VHB got_VVN it_PNP 's_VBZ NA 10089 31831
3 3 NA NA 've_VHB got_VVN NA NA 10089

Related

error in `na_if()`: ! Can't convert `y` <character> to match type of `x` <double>

I have a dataframe df_3 from which I want to mutate multiple columns starting with Team_. I want to replace 0s contained in the columns with NA. I use a code which I have previously successfully used but now gives me the following error:
Error in `mutate()`:
ℹ In argument: `across(starts_with("Team_"), ~na_if(., "0"))`.
Caused by error in `across()`:
! Can't compute column `Team_Num_1`.
Caused by error in `na_if()`:
! Can't convert `y` <character> to match type of `x` <double>.
Backtrace:
1. df_3 %>% mutate(across(starts_with("Team_"), ~na_if(., "0")))
10. dplyr::na_if(Team_Num_1, "0")
Any idea why that is or how I can solve it? I did not change anything in the original df and the code used to run through beforehand, so I am not sure what has changed.
Replicable code:
structure(list(Team_1 = c("0", "werg", "sdf"), Team_Desc_1 = c("wer",
"wtrb", "wergt"), Team_URL_1 = c("ewrg", "werg", "asd"), Team_Ver_1 = c("25",
"2523", "342"), Team_Num_1 = c(0, 23, 12), Team_Value_1 = c("aed",
"jfsa", "vsf"), Name_1 = c("etwbv", "werg", "sdfg"), Txt_1 = c("abc",
"bfh", "fse"), Head_1 = c("abc1", "bfh", "fse"), Team_2 = c("werh",
"wtt", "qwe"), Team_Desc_2 = c("sdfg", "wer", "sdfgv"), Team_URL_2 = c("qwe",
"gvre", "vrw"), Team_Ver_2 = c("4123", "5133", "4126"), Team_Num_2 = c(3,
0, 123), Team_Value_2 = c("aewed", "jfsbwa", "vsbf"), Name_2 = c("qwreg",
"gvr", "wref"), Txt_2 = c("rege", "wer", "vwr"), Head_2 = c("rege1",
"wer", "vwr")), row.names = c(NA, -3L), class = c("tbl_df", "tbl",
"data.frame"))
According to the changelog for dplyr 1.1.0, na_if() now uses the vctrs package, which is stricter about type stability:
na_if() (#6329) now casts y to the type of x before comparison, which makes it clearer that this function is type and size stable on x.
So instead, use na_if(x, "0"):
library(dplyr)
dat %>%
mutate(across(starts_with("Team_"), ~ na_if(.x, "0")))
# # A tibble: 3 × 18
# Team_1 Team_Desc_1 Team_UR…¹ Team_…² Team_…³ Team_…⁴ Name_1 Txt_1 Head_1 Team_2
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 NA wer ewrg 25 aed aed etwbv abc abc1 werh
# 2 werg wtrb werg 2523 jfsa jfsa werg bfh bfh wtt
# 3 sdf wergt asd 342 vsf vsf sdfg fse fse qwe
# # … with 8 more variables: Team_Desc_2 <chr>, Team_URL_2 <chr>,
# # Team_Ver_2 <chr>, Team_Num_2 <chr>, Team_Value_2 <chr>, Name_2 <chr>,
# # Txt_2 <chr>, Head_2 <chr>, and abbreviated variable names ¹​Team_URL_1,
# # ²​Team_Ver_1, ³​Team_Num_1, ⁴​Team_Value_1
If you have a mix of character and numeric columns, you could do:
dat2 <- tibble(
Team_1 = c("0", "werg", "sdf"),
Team_Desc_1 = c(0, 3, 4),
Name_1 = c("etwbv", "werg", "sdfg")
)
dat2 %>%
mutate(
across(starts_with("Team_") & where(is.character), ~ na_if(.x, "0")),
across(starts_with("Team_") & where(is.numeric), ~ na_if(.x, 0)),
)
# # A tibble: 3 × 3
# Team_1 Team_Desc_1 Name_1
# <chr> <dbl> <chr>
# 1 NA NA etwbv
# 2 werg 3 werg
# 3 sdf 4 sdfg

How to unnest a data frame containing list of list with varied length?

I was trying to unnest the following data frame.
df.org <- structure(list(Gene = "ARIH1", Description = "E3 ubiquitin-protein ligase ARIH1",
condition2_cellline = list(c("MCF7", "Jurkat")), condition2_activity = list(
c(40.8284023668639, 13.26973)), condition2_concentration = list(
c("100uM", "100uM")), condition3_cellline = list("Jurkat"),
condition3_activity = list(-4.60251), condition3_concentration = list(
"100uM")), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"))
This is my code:
# Unnest every list column of `df.org` (the tibble defined above); the original
# snippet referred to an undefined `df.ori`.
df.output <- df.org %>%
  unnest(where(is.list), keep_empty = TRUE)
This is what I got:
structure(list(Gene = c("ARIH1", "ARIH1"), Description = c("E3 ubiquitin-protein ligase ARIH1",
"E3 ubiquitin-protein ligase ARIH1"), condition2_cellline = c("MCF7",
"Jurkat"), condition2_activity = c(40.8284023668639, 13.26973
), condition2_concentration = c("100uM", "100uM"), condition3_cellline = c("Jurkat",
"Jurkat"), condition3_activity = c(-4.60251, -4.60251), condition3_concentration = c("100uM",
"100uM")), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-2L))
Is there a way to avoid duplicating those variables with a shorter length? The following output is what I want to get.
df.desired <- structure(list(Gene = c("ARIH1", "ARIH1"), Description = c("E3 ubiquitin-protein ligase ARIH1",
"E3 ubiquitin-protein ligase ARIH1"), condition2_cellline = c("MCF7",
"Jurkat"), condition2_activity = c(40.8284023668639, 13.26973
), condition2_concentration = c("100uM", "100uM"), condition3_cellline = c(NA,
"Jurkat"), condition3_activity = c(NA, -4.60251), condition3_concentration = c(NA,
"100uM")), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-2L))
Thanks so much for any help!
We could also do without reshaping i.e. get the max of the list column lengths in a column, then loop across those list columns, modify the length with the max value and use unnest
library(dplyr)
library(purrr)
library(tidyr)
df.org %>%
mutate(l1 = max(across(where(is.list), lengths)),
across(where(is.list), ~ map(.x, `length<-`, l1)), l1 = NULL) %>%
unnest(where(is.list), keep_empty = TRUE)
-output
# A tibble: 2 × 8
Gene Description condition2_cellline condition2_activity condition2_concentration condition3_cellline condition3_activity condition3_concentration
<chr> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
1 ARIH1 E3 ubiquitin-protein ligase ARIH1 MCF7 40.8 100uM Jurkat -4.60 100uM
2 ARIH1 E3 ubiquitin-protein ligase ARIH1 Jurkat 13.3 100uM <NA> NA <NA>
Here is a suggestion for how it could work:
We pivot_longer all list columns.
Apply the `length<-` function to pad every list to the same length.
Pivot back and unnest.
library(dplyr)
library(tidyr)
df.org %>%
pivot_longer(cols = starts_with("condition")) %>%
mutate(value = lapply(value, `length<-`, max(lengths(value)))) %>%
pivot_wider(names_from = name, values_from = value) %>%
unnest(cols = c(condition2_cellline, condition2_activity, condition2_concentration,
condition3_cellline, condition3_activity, condition3_concentration))
Gene Description condition2_cell~ condition2_acti~ condition2_conc~ condition3_cell~ condition3_acti~ condition3_conc~
<chr> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
1 ARIH1 E3 ubiquitin-prot~ MCF7 40.8 100uM Jurkat -4.60 100uM
2 ARIH1 E3 ubiquitin-prot~ Jurkat 13.3 100uM NA NA NA
>

Perform a series of mutations to columns in dataframe

I am trying to replace some text in my dataframe (a few rows given below)
> dput(Henry.longer[1:4,])
structure(list(N_l = c(4, 4, 4, 4), UG = c("100", "100", "100",
"100"), S = c(12, 12, 12, 12), Sample = c(NA, NA, NA, NA), EQ = c("Henry",
"Henry", "Henry", "Henry"), DF = c(0.798545454545455, 0.798545454545455,
0.798545454545455, 0.798545454545455), meow = c("Henry.Exterior.single",
"Multi", "Henry.Exterior.multi", "Henry.Interior.single"), Girder = c("Henry.Exterior.single",
"Henry.Interior.multi", "Henry.Exterior.multi", "Interior")), row.names = c(NA,
-4L), groups = structure(list(UG = "100", S = 12, .rows = list(
1:4)), row.names = c(NA, -1L), class = c("tbl_df", "tbl",
"data.frame"), .drop = FALSE), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"))
I try to mutate the dataframe as:
Henry.longer <- Henry.longer %>%
mutate(Loading = str_replace(meow, "Henry.Exterior.single", "Single")) %>%
mutate(Loading = str_replace(meow, "Henry.Exterior.multi", "Multi")) %>%
mutate(Loading = str_replace(meow, "Henry.Interior.single", "Single")) %>%
mutate(Loading = str_replace(meow, "Henry.Interior.multi", "Multi")) %>%
mutate(Girder = str_replace(meow, "Henry.Exterior.multi", "Exterior")) %>%
mutate(Girder = str_replace(meow, "Henry.Exterior.single", "Exterior")) %>%
mutate(Girder = str_replace(meow, "Henry.Interior.multi", "Interior")) %>%
mutate(Girder = str_replace(meow, "Henry.Interior.single", "Interior")) %>%
select(-meow)
But for some reason the replacements are not applied to all the rows; I only get:
N_l UG S Sample EQ DF Loading Girder
1 4 100 12 NA Henry 0.799 Henry.Exterior.single Henry.Exterior.single
2 4 100 12 NA Henry 0.799 Multi Henry.Interior.multi
3 4 100 12 NA Henry 0.799 Henry.Exterior.multi Henry.Exterior.multi
4 4 100 12 NA Henry 0.799 Henry.Interior.single Interior
I think we can use lookup vectors for this, if it's easy or safer to use static string lookups:
tr_vec <- c(Henry.Exterior.single = "Single", Henry.Exterior.multi = "Multi", Henry.Interior.single = "Single", Henry.Interior.multi = "Multi")
tr_vec2 <- c(Henry.Exterior.multi = "Exterior", Henry.Exterior.single = "Exterior", Henry.Interior.multi = "Interior", Henry.Interior.single = "Interior")
Henry.longer %>%
mutate(
Loading = coalesce(tr_vec[Loading], Loading),
Girder = coalesce(tr_vec2[Girder], Girder)
)
# # A tibble: 4 x 8
# # Groups: UG, S [1]
# N_l UG S Sample EQ DF Loading Girder
# <dbl> <chr> <dbl> <lgl> <chr> <dbl> <chr> <chr>
# 1 4 100 12 NA Henry 0.799 Single Exterior
# 2 4 100 12 NA Henry 0.799 Multi Interior
# 3 4 100 12 NA Henry 0.799 Multi Exterior
# 4 4 100 12 NA Henry 0.799 Single Interior
The advantage of RonakShah's regex solution is that it can very easily handle many of the types of substrings you appear to need. Regexes do carry a little risk, though, in that they may (though it is unlikely in that answer) mismatch.
Instead of using str_replace I guess it would be easier to extract what you want using regex.
library(dplyr)
Henry.longer %>%
mutate(Loading = sub('.*\\.', '', meow),
Girder = sub('.*\\.(\\w+)\\..*', '\\1', meow))
where
Loading - removes everything until last dot
Girder - extracts a word between two dots.
Oh boy, looks like you've got some answers here already but here's a super-simple one that uses stringr::str_extract:
Henry.longer <- Henry.longer %>%
mutate(Loading = str_extract(meow, "single|multi")) %>%
mutate(Girder = str_extract(meow, "Interior|Exterior"))
It's worth noting that the demo data has a weird entry for meow in one column, so it didn't run perfectly on my machine:

Iterate through columns' suffixes in a for loop. R

I am trying to modify my dataset with a for loop. I want to modify certain cells of some columns depending on the value of its "paired" column. My dataset could be:
data1989 <- data.frame("date" = c("1987-01-01", "1987-01-03", "1987-01-19"),
"NDVI_1" = c(NA, 0.589, 0.120),
"NDVI_3" = c(NA, 0.447, NA),
"NDVI_4" = c(NA, NA, NA),
"pixelQA_1" = c(NA, 66.897,90.599),
"pixelQA_3" = c(NA, 66.097,NA),
"pixelQA_4" = c(NA, NA, NA),
stringsAsFactors = FALSE)
> data1989
date NDVI_1 NDVI_3 NDVI_4 pixelQA_1 pixelQA_3 pixelQA_4
1 1987-01-01 NA NA NA NA NA NA
2 1987-01-03 0.589 0.447 NA 66.897 66.097 NA
3 1987-01-19 0.120 NA NA 90.599 NA NA
Columns are "paired" by the suffix of each column, so NDVI_1 is paired with pixelQA_1, and so on. I want to modify the values in the NDVI columns depending on the values in their "paired" pixelQA columns, following these rules:
if pixelQA is NA -> then NDVI should also be NA.
if pixelQA is 66±0.5 OR 130±0.5 -> then NDVI keeps its value.
if pixelQA is anything other than 66±0.5 OR 130±0.5 -> then NDVI is set to NA (this is bad quality data which needs to be ignored).
Applying these very simple rules my data should look like:
data1989clean <- data.frame("date" = c("1987-01-01", "1987-01-03", "1987-01-19"),
"NDVI_1" = c(NA, NA, NA),
"NDVI_3" = c(NA, 0.447, NA),
"NDVI_4" = c(NA, NA, NA),
"pixelQA_1" = c(NA, 66.897,90.599),
"pixelQA_3" = c(NA, 66.097,NA),
"pixelQA_4" = c(NA, NA, NA),
stringsAsFactors = FALSE)
> data1989clean
date NDVI_1 NDVI_3 NDVI_4 pixelQA_1 pixelQA_3 pixelQA_4
1 1987-01-01 NA NA NA NA NA NA
2 1987-01-03 NA 0.447 NA 66.897 66.097 NA
3 1987-01-19 NA NA NA 90.599 NA NA
To reach my goal I am trying the following for loop:
for(i in 1:4){
data1989$NDVI_[i] <- ifelse(data1989$pixelQA_[i] < 66.5 & data1989$pixelQA_[i] > 65.5 |
data1989$pixelQA_[i] < 130.5 & data1989$pixelQA_[i] > 129.5,
data1989$NDVI_[i], NA)
}
But so far it is not working, as the dataset output looks exactly the same as the original one. Any suggestion will be welcomed.
As suggested by @George Savva, you can achieve this by pivoting longer, correcting the data, and pivoting back wider. Using the tidyverse, that gives:
library(tidyverse)
newdd1 <-
# Start from the wide table: one NDVI_<n> / pixelQA_<n> column pair per suffix.
data1989 %>%
# Reshape to long form: ".value" keeps the part before "_" (NDVI, pixelQA) as
# column names, and the part after "_" is stored in `set`.
pivot_longer(cols = -date,
names_to = c(".value", "set"),
names_sep = "_") %>%
# Keep NDVI only where pixelQA lies between 65.5 and 66.5 or between 129.5
# and 130.5; a missing pixelQA or any other value yields NA.
mutate(NDVI = case_when(is.na(pixelQA) ~ NA_real_,
between(pixelQA, 65.5, 66.5) ~ NDVI,
between(pixelQA, 129.5, 130.5) ~ NDVI,
TRUE ~ NA_real_)) %>%
# Reshape back to wide form, giving NDVI_<set> and pixelQA_<set> columns.
pivot_wider(names_from = set,
values_from = c(NDVI, pixelQA))

How to make a frequency table from a data frame in R

The data frame is like this:
enter image description here
header: system
Row 1: 00000000000000000503_0
Row 2: 00000000000000000503_1
Row 3: 00000000000000000503_2
Row 4: 00000000000000000503_3
Row 5: 000000000000000004e7_0
Row 6: 000000000000000004e7_1
Row 7: 00000000000000000681_0
Row 8: 00000000000000000681_1
Row 9: 00000000000000000681_2
I want to generate a frequency table with the quantities of the code before string "_" such that:
"00000000000000000503" appears 4 times, "000000000000000004e7" appears 2 times, and so on.
How do I do this in R?
Remove everything after underscore and use table to count frequency
table(sub("_.*", "", data$col1))
#Also
#table(sub("(.*)_.*", "\\1", data$col1))
#000000000000000004e7 00000000000000000503 00000000000000000681
# 2 4 3
If final output needs to be a dataframe use stack
stack(table(sub("_.*", "", data$col1)))
# values ind
#1 2 000000000000000004e7
#2 4 00000000000000000503
#3 3 00000000000000000681
data
data <- structure(list(col1 = structure(c(3L, 4L, 5L, 6L, 1L, 2L, 7L,
8L, 9L), .Label = c("000000000000000004e7_0", "000000000000000004e7_1",
"00000000000000000503_0", "00000000000000000503_1",
"00000000000000000503_2",
"00000000000000000503_3", "00000000000000000681_0",
"00000000000000000681_1",
"00000000000000000681_2"), class = "factor")), class = "data.frame",
row.names = c(NA, -9L))
A dplyr-tidyr alternative:
df %>%
tidyr::separate(V3, c("target", "non_target")) %>%
count(target)
# A tibble: 3 x 2
target n
<chr> <int>
1 000000000000000004e7 2
2 00000000000000000503 4
3 00000000000000000681 3
With base:
# Count codes by the text before the first "_". The `df` given under "Data:"
# below stores the codes in column V3 (it has no `system` column), and
# vapply() guarantees a character result.
table(vapply(strsplit(df$V3, "_"), `[[`, character(1), 1L))
Data:
df <- structure(list(V1 = c("Row", "Row", "Row", "Row", "Row", "Row",
"Row", "Row", "Row"), V2 = c("1:", "2:", "3:", "4:", "5:", "6:",
"7:", "8:", "9:"), V3 = c("00000000000000000503_0", "00000000000000000503_1",
"00000000000000000503_2", "00000000000000000503_3", "000000000000000004e7_0",
"000000000000000004e7_1", "00000000000000000681_0", "00000000000000000681_1",
"00000000000000000681_2")), class = "data.frame", row.names = c(NA,
-9L))
Another option using the stringr library that is included in tidyverse
> library(tidyverse)
> mydata <- data.frame(system = c("00000000000000000503_0",
"00000000000000000503_1",
"00000000000000000503_2",
"00000000000000000503_3",
"000000000000000004e7_0",
"000000000000000004e7_1",
"00000000000000000681_0",
"00000000000000000681_1",
"00000000000000000681_2"))
> mydata
system
1 00000000000000000503_0
2 00000000000000000503_1
3 00000000000000000503_2
4 00000000000000000503_3
5 000000000000000004e7_0
6 000000000000000004e7_1
7 00000000000000000681_0
8 00000000000000000681_1
9 00000000000000000681_2
> # Split data using str_split
> mydata$leftside <- sapply(mydata$system, function(x) unlist(str_split(x, "_"))[1]) #split string by the "_" and take first piece
> mydata$rightside <- sapply(mydata$system, function(x) unlist(str_split(x, "_"))[2]) #split string by the "_" and take second piece
>
> mydata
system leftside rightside
1 00000000000000000503_0 00000000000000000503 0
2 00000000000000000503_1 00000000000000000503 1
3 00000000000000000503_2 00000000000000000503 2
4 00000000000000000503_3 00000000000000000503 3
5 000000000000000004e7_0 000000000000000004e7 0
6 000000000000000004e7_1 000000000000000004e7 1
7 00000000000000000681_0 00000000000000000681 0
8 00000000000000000681_1 00000000000000000681 1
9 00000000000000000681_2 00000000000000000681 2
> # An alternative tabulation function to base::table(); it can provide nicer options.
> xtabs(data = mydata, formula = ~leftside)
leftside
000000000000000004e7 00000000000000000503 00000000000000000681
2 4 3
A tidyverse answer would be
# Count rows per system code (the part before the final "_<suffix>").
my_data <- mydata %>%
  # across() + where() replaces the superseded mutate_if().
  mutate(across(where(is.factor), as.character)) %>%
  # Strip the trailing "_<suffix>" so rows sharing a code group together.
  mutate(system = gsub('_[^_]*$', '', system)) %>%
  group_by(system) %>%
  count() %>%
  ungroup()
my_data
An option with str_remove and group_by
library(stringr)
library(dplyr)
df %>%
group_by(V3 = str_remove(V3, "_\\d+$")) %>%
summarise(n = n())
# A tibble: 3 x 2
# V3 n
# <chr> <int>
#1 000000000000000004e7 2
#2 00000000000000000503 4
#3 00000000000000000681 3
Or in base R with table and trimws
table(trimws(df$V3, whitespace = "_[0-9]+"))
data
df <- structure(list(V1 = c("Row", "Row", "Row", "Row", "Row", "Row",
"Row", "Row", "Row"), V2 = c("1:", "2:", "3:", "4:", "5:", "6:",
"7:", "8:", "9:"), V3 = c("00000000000000000503_0", "00000000000000000503_1",
"00000000000000000503_2", "00000000000000000503_3", "000000000000000004e7_0",
"000000000000000004e7_1", "00000000000000000681_0", "00000000000000000681_1",
"00000000000000000681_2")), class = "data.frame", row.names = c(NA,
-9L))

Resources