New column containing string that appears the most in the row - r

I'm trying to create a column containing the string that appears most often in each row, and another column with the number of times this most prevalent string appears.
Example data:
f <- data.frame(ID = 1:4,
                V1 = c("A", "B", "C", "D"),
                V2 = c("A", "B", "D", "B"),
                V3 = c("A", "C", "D", "B"))

Here is a base R way:
count <- sapply(apply(f[, -1], 1, table), max)
count
# [1] 3 2 2 2
category <- names(sapply(apply(f[, -1], 1, table), which.max))
category
# [1] "A" "B" "D" "B"
f2 <- data.frame(f, category, count)
f2
# ID V1 V2 V3 category count
# 1 1 A A A A 3
# 2 2 B B C B 2
# 3 3 C D D D 2
# 4 4 D B B B 2
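A small consolidation of the same idea (a minimal sketch, not from the original answers): tabulate each row once, then read both the category and the count off the stored tables.
# One table() per row; apply() returns a list here because the rows
# differ in how many unique values they contain.
tabs <- apply(f[, -1], 1, table)
f$category <- sapply(tabs, function(t) names(t)[which.max(t)])
f$count <- sapply(tabs, max)
f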

A data.table approach: reshape to long format, count each value per ID, keep the most frequent one, and join back.
df <- data.frame(ID = 1:4,
                 V1 = c("A", "B", "C", "D"),
                 V2 = c("A", "B", "D", "B"),
                 V3 = c("A", "C", "D", "B"))
library(data.table)
setDT(df)
# long format: one row per (ID, value)
other <- melt(df, id.vars = "ID", measure.vars = c("V1", "V2", "V3"))
# count occurrences of each value within each ID
other <- other[, .N, by = .(ID, value)]
colnames(other) <- c("ID", "category", "count")
# keep the most frequent value per ID
other <- other[, .SD[which.max(count)], by = .(ID)]
res <- merge(df, other, by = "ID")
res

We can use dplyr's rowwise() to apply table() to each row across V1:V3:
library(dplyr)
df |>
  rowwise() |>
  mutate(category = names(table(c_across(V1:V3)))[which.max(table(c_across(V1:V3)))],
         count = max(table(c_across(V1:V3))))
Output
# A tibble: 4 × 6
# Rowwise:
ID V1 V2 V3 category count
<int> <chr> <chr> <chr> <chr> <int>
1 1 A A A A 3
2 2 B B C B 2
3 3 C D D D 2
4 4 D B B B 2

Related

Joining two incomplete data.tables with the same column names

I have two incomplete data.tables with the same column names.
library(data.table)
dt1 <- data.table(id = c(1, 2, 3), v1 = c("w", "x", NA), v2 = c("a", NA, "c"))
dt2 <- data.table(id = c(2, 3, 4), v1 = c(NA, "y", "z"), v2 = c("b", "c", NA))
They look like this:
dt1
id v1 v2
1: 1 w a
2: 2 x <NA>
3: 3 <NA> c
dt2
id v1 v2
1: 2 <NA> b
2: 3 y c
3: 4 z <NA>
Is there a way to merge the two by filling in the missing info?
This is the result I'm after:
id v1 v2
1: 1 w a
2: 2 x b
3: 3 y c
4: 4 z <NA>
I've tried various data.table joins and merges, but I either get the columns repeated:
merge(dt1, dt2, by = "id", all = TRUE)
id v1.x v2.x v1.y v2.y
1: 1 w a <NA> <NA>
2: 2 x <NA> <NA> b
3: 3 <NA> c y c
4: 4 <NA> <NA> z <NA>
or the rows repeated:
merge(dt1, dt2, by = names(dt1), all = TRUE)
id v1 v2
1: 1 w a
2: 2 <NA> b
3: 2 x <NA>
4: 3 <NA> c
5: 3 y c
6: 4 z <NA>
You can group by id and take the unique values after omitting NAs, i.e.
library(data.table)
merge(dt1, dt2, all = TRUE)[
  , lapply(.SD, function(i) na.omit(unique(i))),
  by = id][]
# id v1 v2
#1: 1 w a
#2: 2 x b
#3: 3 y c
#4: 4 z <NA>
You could also start out with rbind():
rbind(dt1, dt2)[, lapply(.SD, \(x) unique(x[!is.na(x)])), by = id]
# id v1 v2
# <num> <char> <char>
# 1: 1 w a
# 2: 2 x b
# 3: 3 y c
# 4: 4 z <NA>
First do a full_join(), then group_by() id and collapse each group to a single filled-in row:
library(dplyr)
library(tidyr)
dt1 %>%
  full_join(dt2, by = c("id", "v1", "v2")) %>%
  group_by(id) %>%
  fill(starts_with('v'), .direction = 'updown') %>%
  slice(1) %>%
  ungroup()
Output:
# A tibble: 4 × 3
id v1 v2
<dbl> <chr> <chr>
1 1 w a
2 2 x b
3 3 y c
4 4 z NA
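The question's first merge attempt (the one that produced v1.x/v1.y columns) can also be finished by coalescing each pair of columns. A minimal sketch, assuming data.table >= 1.12.4 for fcoalesce():
library(data.table)
# Outer join on id, then take the first non-NA value from each .x/.y pair.
res <- merge(dt1, dt2, by = "id", all = TRUE)
res[, `:=`(v1 = fcoalesce(v1.x, v1.y),
           v2 = fcoalesce(v2.x, v2.y))]
res[, c("v1.x", "v1.y", "v2.x", "v2.y") := NULL]
res[]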

R - how to create a loop for a unique word frequency count

I have the following dataframe:
df <- data.frame(q = c("a, b, c", "a, b, d"), combined = c("big big sentence","I like sentences"))
q combined
1 a, b, c big big sentence
2 a, b, d I like sentences
I am looking to count the frequency of each unique word per unique q. The desired output looks like:
words freq V1 V2 V3
1 big 2 a b c
2 sentence 1 a b c
3 I 1 a b d
4 like 1 a b d
5 sentences 1 a b d
I managed to write some code that does this for the first row of df only. How can I turn this into a loop, so that it does the data manipulation steps for each of the rows in df?
The code I wrote for 1 row which works:
library(stringr)
df_1 <- df[1, ]
countdf <- data.frame(table(unlist(strsplit(tolower(df_1$combined), " "))))
countsplit <- str_split_fixed(df_1$q, ",", 3)
countsplit <- as.data.frame(countsplit)
countdf$V1 <- countsplit$V1
countdf$V2 <- countsplit$V2
countdf$V3 <- countsplit$V3
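For the literal loop the question asks for, one option (a minimal sketch, not taken from the answers below) is to wrap that single-row code in a for loop and stack the per-row results:
library(stringr)
rows <- vector("list", nrow(df))
for (i in seq_len(nrow(df))) {
  df_i <- df[i, ]
  # Same steps as the single-row code; note tolower() turns "I" into "i".
  countdf <- data.frame(table(unlist(strsplit(tolower(df_i$combined), " "))))
  names(countdf) <- c("words", "freq")
  # Splitting on ", " keeps the q pieces free of leading spaces.
  countsplit <- as.data.frame(str_split_fixed(df_i$q, ", ", 3))
  countdf$V1 <- countsplit$V1
  countdf$V2 <- countsplit$V2
  countdf$V3 <- countsplit$V3
  rows[[i]] <- countdf
}
result <- do.call(rbind, rows)
result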
A tidyverse answer avoids the explicit loop:
library(tidyverse)
df <- data.frame(q = c("a, b, c", "a, b, d"), combined = c("big big sentence", "I like sentences"))
df %>%
  as_tibble() %>%
  transmute(q, words = combined %>% map(~ .x %>% str_split(" ") %>% simplify)) %>%
  unnest(words) %>%
  separate(q, into = c("V1", "V2", "V3")) %>%
  count(V1, V2, V3, words, name = "freq")
#> # A tibble: 5 x 5
#> V1 V2 V3 words freq
#> <chr> <chr> <chr> <chr> <int>
#> 1 a b c big 2
#> 2 a b c sentence 1
#> 3 a b d I 1
#> 4 a b d like 1
#> 5 a b d sentences 1
Created on 2022-02-22 by the reprex package (v2.0.0)
You can use separate_rows and separate:
library(tidyr)
library(dplyr)
df %>%
  separate_rows(combined) %>%
  group_by(q, words = combined) %>%
  summarise(freq = n()) %>%
  separate(q, into = c("V1", "V2", "V3"))
# A tibble: 5 x 5
V1 V2 V3 words freq
<chr> <chr> <chr> <chr> <int>
1 a b c big 2
2 a b c sentence 1
3 a b d I 1
4 a b d like 1
5 a b d sentences 1

Grouping multiple columns and bringing values into a character string

This is an example we can work with:
library(dplyr)
df <- tibble(y = c("a", "a", "a", "a", "a", "a"),
             z = c("b", "b", "b", "b", "b", "b"),
             a = c("aaa", "aaa", "aaa", "bbb", "bbb", "bbb"),
             b = c(1, 2, 3, 1, 2, 3),
             c = c(5, 10, 15, 100, 95, 90))
df
# A tibble: 6 x 5
y z a b c
<chr> <chr> <chr> <dbl> <dbl>
1 a b aaa 1 5
2 a b aaa 2 10
3 a b aaa 3 15
4 a b bbb 1 100
5 a b bbb 2 95
6 a b bbb 3 90
I want to group by columns y, z and a and combine columns b and c into a single string. The final result should look exactly like this:
# A tibble: 2 x 4
y z a result
<chr> <chr> <chr> <chr>
1 a b aaa {"1":5,"2":10,"3":15}
2 a b bbb {"1":100,"2":95,"3":90}
Which I can almost achieve with:
b <- by(df[-1:-3], df$a, function(x)
  sprintf("{%s}", toString(Reduce(paste0, c(x, "\"", "\":")[c(3, 1, 4, 2)]))))
data.frame(a = unique(df$a), result = do.call(rbind, as.list(b)), row.names = NULL)
a result
1 aaa {"1":5, "2":10, "3":15}
2 bbb {"1":100, "2":95, "3":90}
This only groups by column a, though, and not by all three (y, z and a) columns. I got the hint that I can fix it with the aggregate function, but I'm having a hard time applying it.
Using dplyr you can make use of sprintf()/paste0():
library(dplyr)
df %>%
  group_by(y, z, a) %>%
  summarise(result = paste0('{', toString(sprintf('"%d":%d', b, c)), '}')) %>%
  ungroup() %>%
  data.frame()
#   y z   a                  result
# 1 a b aaa   {"1":5, "2":10, "3":15}
# 2 a b bbb {"1":100, "2":95, "3":90}
Using by(), this can be written as:
do.call(rbind, by(df, list(df$y, df$z, df$a), function(x)
  cbind(unique(x[1:3]),
        result = paste0('{', toString(sprintf('"%d":%d', x$b, x$c)), '}'))))
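The question also mentions aggregate(); here is a minimal sketch of that route (not from the original answers): build the "key":value pairs first, then collapse them per (y, z, a) group.
df2 <- as.data.frame(df)
df2$pair <- sprintf('"%g":%g', df2$b, df2$c)
out <- aggregate(pair ~ y + z + a, data = df2,
                 FUN = function(s) sprintf("{%s}", toString(s)))
names(out)[names(out) == "pair"] <- "result"
out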

Shiny: Join Cells in R

I would like to merge the cells of rows that share the same id, so that each id appears only once and the other columns' values are joined together.
Using dplyr:
library(dplyr)
df <- tibble(id = c(1, 1, 3),
             b = c("foo", "bar", "foo"),
             c = c("x", "y", "z"))
df
# A tibble: 3 x 3
id b c
<dbl> <chr> <chr>
1 1 foo x
2 1 bar y
3 3 foo z
df %>%
  group_by(id) %>%
  summarize(new = paste(b, collapse = ","),
            new2 = paste(c, collapse = ","))
which results in:
# A tibble: 2 x 3
id new new2
<dbl> <chr> <chr>
1 1 foo,bar x,y
2 3 foo z
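If there are many columns to collapse, the same idea can be written once with across() (a sketch, assuming dplyr >= 1.0); note that the collapsed columns keep their original names here:
df %>%
  group_by(id) %>%
  summarize(across(everything(), ~ paste(.x, collapse = ",")))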

Number of non-NA records by column, grouped

I have a data.table that looks something like this:
library(data.table)
dt <- data.table(
  group1 = c("a", "a", "a", "b", "b", "b", "b"),
  group2 = c("x", "x", "y", "y", "z", "z", "z"),
  data1 = c(NA, rep(TRUE, 3), rep(FALSE, 2), "sometimes"),
  data2 = c("sometimes", rep(FALSE, 3), rep(TRUE, 2), NA))
dt
group1 group2 data1 data2
1: a x NA sometimes
2: a x TRUE FALSE
3: a y TRUE FALSE
4: b y TRUE FALSE
5: b z FALSE TRUE
6: b z FALSE TRUE
7: b z sometimes NA
My goal is to find the number of non-NA records in each data column, grouped by group1 and group2.
group1 group2 data1 data2
1: a x 1 2
2: a y 1 1
3: b y 1 1
4: b z 3 2
I have this code left over from dealing with another part of the dataset, which had no NAs and was logical:
dt[, lapply(.SD, sum), by = list(group1, group2), .SDcols = c("data3", "data4")]
But it won't work with NA values or non-logical values.
dt[, lapply(.SD, function(x) sum(!is.na(x))), by = .(group1, group2)]
# group1 group2 data1 data2
#1: a x 1 2
#2: a y 1 1
#3: b y 1 1
#4: b z 3 2
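If the table also contained columns that should not be counted, the same call can be restricted to the data columns with .SDcols, echoing the question's leftover snippet (a sketch, assuming data.table >= 1.12 for patterns()):
dt[, lapply(.SD, function(x) sum(!is.na(x))),
   by = .(group1, group2),
   .SDcols = patterns("^data")]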
Another alternative is to melt/dcast in order to avoid the per-column operation. This removes the NAs and uses the length function (i.e. a count) by default:
dcast(melt(dt, id = c("group1", "group2"), na.rm = TRUE), group1 + group2 ~ variable)
# Aggregate function missing, defaulting to 'length'
# group1 group2 data1 data2
# 1: a x 1 2
# 2: a y 1 1
# 3: b y 1 1
# 4: b z 3 2
Using dplyr (with some help from David Arenburg & eddi):
library(dplyr)
dt %>% group_by(group1, group2) %>% summarise_each(funs(sum(!is.na(.))))
Source: local data table [4 x 4]
Groups: group1
group1 group2 data1 data2
1 a x 1 2
2 a y 1 1
3 b y 1 1
4 b z 3 2
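summarise_each() and funs() have since been deprecated; with current dplyr the same result can be written with across() (a sketch, not part of the original answer):
library(dplyr)
dt %>%
  group_by(group1, group2) %>%
  summarise(across(everything(), ~ sum(!is.na(.x))), .groups = "drop")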
