how to count and remove similar strings across columns

how to count and remove similar strings across columns - r

I have a data with many columns . for example this is with three columns
df<-structure(list(V1 = structure(c(5L, 1L, 7L, 3L, 2L, 4L, 6L, 6L
), .Label = c("CPSIAAAIAAVNALHGR", "DLNYCFSGMSDHR", "FPEHELIVDPQR",
"IADPDAVKPDDWDEDAPSK", "LWADHGVQACFGR", "WGEAGAEYVVESTGVFTTMEK",
"YYVTIIDAPGHR"), class = "factor"), V2 = structure(c(5L, 2L,
7L, 3L, 4L, 6L, 1L, 1L), .Label = c("", "CPSIAAAIAAVNALHGR",
"GCITIIGGGDTATCCAK", "HVGPGVLSMANAGPNTNGSQFFICTIK", "LLELGPKPEVAQQTR",
"MVCCSAWSEDHPICNLFTCGFDR", "YYVTIIDAPGHR"), class = "factor"),
V3 = structure(c(4L, 3L, 2L, 4L, 3L, 1L, 1L, 1L), .Label = c("",
"AVCMLSNTTAIAEAWAR", "DLNYCFSGMSDHR", "FPEHELIVDPQR"), class = "factor")), .Names = c("V1",
"V2", "V3"), class = "data.frame", row.names = c(NA, -8L))
-The first column, we don't look at any other column, we just count how many strings there are and keep the unique one
The second column, we keep the unique and also we remove those that were already in the first column
The third column, we keep the unique and we remove the strings that were in the first and second column
This continues for as many columns as we have
for example for this data, we will have the following
Column 1 Column 2 Column 3
LWADHGVQACFGR
CPSIAAAIAAVNALHGR LLELGPKPEVAQQTR AVCMLSNTTAIAEAWAR
YYVTIIDAPGHR GCITIIGGGDTATCCAK
FPEHELIVDPQR HVGPGVLSMANAGPNTNGSQFFICTIK
DLNYCFSGMSDHR MVCCSAWSEDHPICNLFTCGFDR
IADPDAVKPDDWDEDAPSK
WGEAGAEYVVESTGVFTTMEK

Here is a solution via tidyverse,
library(tidyverse)
df1 <- df %>%
gather(var, string) %>%
filter(string != '' & !duplicated(string)) %>%
group_by(var) %>%
mutate(cnt = seq(n())) %>%
spread(var, string) %>%
select(-cnt)
Which gives
# A tibble: 7 x 4
cnt V1 V2 V3
* <int> <chr> <chr> <chr>
1 1 LWADHGVQACFGR LLELGPKPEVAQQTR AVCMLSNTTAIAEAWAR
2 2 CPSIAAAIAAVNALHGR GCITIIGGGDTATCCAK <NA>
3 3 YYVTIIDAPGHR HVGPGVLSMANAGPNTNGSQFFICTIK <NA>
4 4 FPEHELIVDPQR MVCCSAWSEDHPICNLFTCGFDR <NA>
5 5 DLNYCFSGMSDHR <NA> <NA>
6 6 IADPDAVKPDDWDEDAPSK <NA> <NA>
7 7 WGEAGAEYVVESTGVFTTMEK <NA> <NA>
You can use colSums to get the number of strings,
colSums(!is.na(df1))
#V1 V2 V3
# 7 4 1
A similar approach via base R, that would save the strings in a list would be,
df[] <- lapply(df, as.character)
d1 <- stack(df)
d1 <- d1[d1$values != '' & !duplicated(d1$values),]
l1 <- unstack(d1, values ~ ind)
lengths(l1)
#V1 V2 V3
# 7 4 1

A base R solution. df2 is the final output.
# Convert to character
L1 <- lapply(df, as.character)
# Get unique string
L2 <- lapply(L1, unique)
# Remove ""
L3 <- lapply(L2, function(vec){vec <- vec[!(vec %in% "")]})
# Use for loop to remove non-unique string from previous columns
for (i in 2:length(L3)){
previous_vec <- unlist(L3[1:(i - 1)])
current_vec <- L3[[i]]
L3[[i]] <- current_vec[!(current_vec %in% previous_vec)]
}
# Get the maximum column length
max_num <- max(sapply(L3, length))
# Append "" to each column
L4 <- lapply(L3, function(vec){vec <- c(vec, rep("", max_num - length(vec)))})
# Convert L4 to a data frame
df2 <- as.data.frame(do.call(cbind, L4))

Related

Count values above 0 and count how many match a pattern in a row (in R)

I would like to count how many rows in each column are >0 and how many of those rows (that are >0) start with "mt-".
The result should also be in a data frame.
Here is an example.
df1
mt-abc 1 0 2
mt-dca 1 1 2
cla 0 2 0
dla 0 3 0
result
above0 2 3 2
mt 2 1 2

In base R you can do :
mat <- df[-1] > 0
rbind(above0 = colSums(mat),
mt = colSums(startsWith(df$V1, 'mt') & mat))
# V2 V3 V4
#above0 2 3 2
#mt 2 1 2
Actual data has numbers in the column and names in rownames for which we can do :
mat <- df > 0
rbind(above0 = colSums(mat),
mt = colSums(startsWith(rownames(df), 'mt') & mat))
data
df <- structure(list(V1 = c("mt-abc", "mt-dca", "cla", "dla"), V2 = c(1L,
1L, 0L, 0L), V3 = 0:3, V4 = c(2L, 2L, 0L, 0L)), class = "data.frame",
row.names = c(NA, -4L))

I don't think this is the most elegant approach in the tidyverse, but just out of curiosity:
library(tidyverse)
my_df <- data.frame(
stringsAsFactors = FALSE,
var = c("mt-abc", "mt-dca", "cla", "dla"),
x = c(1L, 1L, 0L, 0L),
y = c(0L, 1L, 2L, 3L),
z = c(2L, 2L, 0L, 0L)
)
df_1 <- my_df %>%
summarize(across(.cols=x:z, .fn=~sum(.x > 0))) %>%
mutate(var="above0")
df_2 <- my_df %>%
filter(str_detect(var, "^mt")) %>%
summarise(across(.cols=x:z, .fn=~sum(.x > 0))) %>%
mutate(var="mt")
bind_rows(df_1, df_2)
#> x y z var
#> 1 2 3 2 above0
#> 2 2 1 2 mt
Created on 2020-12-04 by the reprex package (v0.3.0)

Rowwise median for multiple columns using dplyr

Given the following dataset, I want to compute for each row the median of the columns M1,M2 and M3. I am looking for a solution where the final column is added to the dataframe under the name 'Median'. The column names (M1:M3) should not be used directly (in the original dataset, there are many more columns, not just 3).
# A tibble: 8 x 5
I1 M1 M2 I2 M3
<int> <int> <int> <int> <int>
1 3 4 5 3 5
2 2 2 2 2 1
3 2 2 2 2 2
4 3 1 3 3 1
5 2 1 3 3 1
6 3 2 4 4 3
7 3 1 3 4 1
8 2 1 3 2 3
You can load the dataset using:
df = structure(list(I1 = c(3L, 2L, 2L, 3L, 2L, 3L, 3L, 2L), M1 = c(4L,
2L, 2L, 1L, 1L, 2L, 1L, 1L), M2 = c(5L, 2L, 2L, 3L, 3L, 4L, 3L,
3L), I2 = c(3L, 2L, 2L, 3L, 3L, 4L, 4L, 2L), M3 = c(5L, 1L, 2L,
1L, 1L, 3L, 1L, 3L)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -8L), .Names = c("I1", "M1", "M2", "I2",
"M3"))
I know that several similar questions have already been asked. However, most solutions posted use rowMeans or rowSums. I'm looking for a solution where:
no 'row-function' can be used.
the solution is a simple dplyr solution
The reason for (2) is that I am teaching the 'tidyverse' to total beginners.

We could use rowMedians
library(matrixStats)
library(dplyr)
df %>%
mutate(Median = rowMedians(as.matrix(.[grep('M\\d+', names(.))])))
Or if we need to use only tidyverse functions, convert it to 'long' format with gather, summarize by row and get the median of the 'value' column
df %>%
rownames_to_column('rn') %>%
gather(key, value, starts_with('M')) %>%
group_by(rn) %>%
summarise(Median = median(value)) %>%
ungroup %>%
select(-rn) %>%
bind_cols(df, .)
Or another option is rowwise() from dplyr (hope the row is not a problem)
df %>%
rowwise() %>%
mutate(Median = median(c(!!! rlang::syms(grep('M', names(.), value=TRUE)))))

Given a dataframe df with some numeric values:
df <- structure(list(X0 = c(0.82046171427112, 0.836224720981912, 0.842547521493854,
0.848014287631906, 0.850943494153631, 0.85425398956647, 0.85616876970771,
0.856855792247478, 0.857471048654811, 0.857507363153284, 0.874487063791594,
1.70684558846347, 1.95711031206168, 6.84386713155156), X1 = c(0.755674148966666,
0.765242580861224, 0.774422478168495, 0.776953642833977, 0.778128315184819,
0.778611604461183, 0.778624581647491, 0.778454002430202, 1.52708579075974,
13.0356519295685, 18.0590093408357, 21.1371199340156, 32.4192814934364,
33.2355314147089), X2 = c(0.772236670327724, 0.788112332251601,
0.797695511542613, 0.804257521548174, 0.809815828400878, 0.816592605516508,
0.819421106011397, 0.821734473885381, 0.822561946509595, 0.822334970491528,
0.822404634095793, 2.66875340820162, 1.40412743557514, 6.33377768022403
), X3 = c(0.764363881671609, 0.788288196346034, 0.79927498357549,
0.805446784334039, 0.810604881970155, 0.814634331592811, 0.817002594424753,
0.818129844752095, 0.818572101954132, 0.818630700031836, 3.06323952591121,
6.4477868357554, 11.4657041958038, 9.27821049066848)), class = "data.frame", row.names = c(NA,
-14L))
One can easily compute row-wise median using base R like so:
df$median <- sapply(
seq(nrow(df)),
function(i) df[i, 1:4] %>% unlist %>% median
)
Above I select columns manually with numeric range, but to satisfy the dplyr requirement you can use dplyr::select() to choose your columns:
df$median <- sapply(
df %>% nrow %>% seq,
function(i) df[i, ] %>%
dplyr::select(X1, X2) %>%
unlist %>% median
)
I like this method because you don't have to search for different functions to calculate anything.
For example, standard deviation:
df$sd <- sapply(
df %>% nrow %>% seq,
function(i) df[i, ] %>%
dplyr::select(X1, X2) %>%
unlist %>% sd
)

Pick a column to multiply with, contingent on value of other variables

I am still doing my first footsteps with R and found SO to be a great tool for learning more and finding answers to my questions. For this one i though did not manage to find any good solution here.
I have a dataframe that can be simplified to this structure:
set.seed(10)
df <- data.frame(v1 = rep(1:2, times=3),
v2 = c("A","B","B","A","B","A"),
v3 = sample(1:6),
xA_1 = sample(1:6),
xA_2 = sample(1:6),
xB_1 = sample(1:6), xB_2 = sample(1:6))
df thus looks like this:
> df
v1 v2 v3 xA_1 xA_2 xB_1 xB_2
1 1 A 4 2 1 3 3
2 2 B 2 6 3 5 4
3 1 B 5 3 2 4 5
4 2 A 3 5 4 2 1
5 1 B 1 4 6 6 2
6 2 A 6 1 5 1 6
I now want R to create a fourth variable, which is dependent on the values of v1 and v2. I achieve this by using the following code:
df <- data.table(df)
df[, v4 := ifelse(v1 == 1 & v2 == "A", v3*xA_1,
ifelse(v1 == 1 & v2 == "B", v3*xB_1,
ifelse(v1 == 2 & v2 == "A", v3*xA_2,
ifelse(v1 == 2 & v2 == "B", v3*xB_2, v3*1))))]
So v4 is created by multiplying v3 with the column that contains the v1 and the v2 value
(e.g. for row 1: v1=1 and v2=A thus multiply v3=4 with xA_1=2 -> 8).
> df$v4
[1] 8 8 20 12 6 30
Obviuosly, my ifelse approach is tedious when v1 and v2 in fact have many more different values than they have in this example. So I am looking for an efficient way to tell R if v1 == y & v2 == z, multiply v3 with column xy_z.
I tried writing a for-loop, writing a function that has y and z as index and using the apply function. However none of this worked as wanted.
I appreciate any ideas!

Here's a base R option:
i <- paste0("x", df$v2, "_", df$v1)
df$v4 <- df$v3 * as.numeric(df[cbind(1:nrow(df), match(i, names(df)))])
For the sample data provided below, it creates a column v4 as:
> df$v4
[1] 25 12 2 6 3 10
Or if you want to include the "else" condition to multiply by 1 in case there's no matching column name:
i <- paste0("x", df$v2, "_", df$v1)
tmp <- as.numeric(df[cbind(1:nrow(df), match(i, names(df)))])
df$v4 <- df$v3 * ifelse(is.na(tmp), 1, tmp)
Sample data:
df <- structure(list(v1 = c(1L, 2L, 1L, 2L, 1L, 2L), v2 = structure(c(1L,
2L, 2L, 1L, 2L, 1L), .Label = c("A", "B"), class = "factor"),
v3 = c(5L, 4L, 1L, 6L, 3L, 2L), xA_1 = c(5L, 6L, 3L, 1L,
2L, 4L), xA_2 = c(6L, 4L, 2L, 1L, 3L, 5L), xB_1 = c(4L, 6L,
2L, 5L, 1L, 3L), xB_2 = c(5L, 3L, 2L, 4L, 1L, 6L)), .Names = c("v1",
"v2", "v3", "xA_1", "xA_2", "xB_1", "xB_2"), row.names = c(NA,
-6L), class = "data.frame")

This is a standard "wide" table problem - what you want is harder to do as-is, but easy when the data is "melted":
dt = as.data.table(df)
melt(dt, id.vars = c('v1', 'v2', 'v3'))[variable == paste0('x', v2, '_', v1)
][dt, on = c('v1', 'v2', 'v3'), v3 * value]
#[1] 8 8 20 12 6 30

You can try this :
v4 <- c()
for(i in 1:nrow(df)){
col <- paste("x",df$v2[i],"_",df$v1[i],sep="")
v4 <- c(v4,df$v3[i]*df[i,col])
}
df$v4 <- v4

delete the rows with duplicated ids

I want to delete the rows with duplicated ids
data
id V1 V2
1 a 1
1 b 2
2 a 2
2 c 3
3 a 4
The problem is that some people did the test for a few times, which generate multiple scores on V2, I want to delete the duplicated id and retain one of the scores in V2 randomly.
output
id V1 V2
1 a 1
2 a 2
3 a 4
I tried this:
neu <- unique(neu$userid)
but it didn't work

Using dplyr:
library(dplyr)
set.seed(1)
df %>% sample_frac(., 1) %>% arrange(id) %>% distinct(id)
Output:
id V1 V2
1 1 b 2
2 2 c 3
3 3 a 4
Data:
df <- structure(list(id = c(1L, 1L, 2L, 2L, 3L), V1 = structure(c(1L,
2L, 1L, 3L, 1L), .Label = c("a", "b", "c"), class = "factor"),
V2 = c(1L, 2L, 2L, 3L, 4L)), .Names = c("id", "V1", "V2"), class = "data.frame", row.names = c(NA,
-5L))

Creating the data frame based on your example:
df <- read.table(text =
"id V1 V2
1 a 1
1 b 2
2 a 2
2 c 3
3 a 4", h = T)
Since you want to remove rows randomly, first sort the rows of your data frame randomly:
df <- df[sample(nrow(df)),]
Then remove duplicates in the order of appearence:
df <- df[!duplicated(df$id),]
Now sort your data frame back:
df <- df[with(df, order(id)),]
Remember to change df by your data frame name.

Convert factors to numbers in a categorical data quickly

I have a big categorical data frame like
col1 col2 col3
abcd rweff 3433534
gfds erwq trdfs
abcd erwq trdfs
abcd rweff 3433534
......
I want to replace all these complicated categories to simple numbers, something like this
col1 col2 col3
1 2 1
2 1 2
1 1 2
1 2 1
......
How do I quickly achieve it in R?

Assuming that the columns are of 'factor' class
df1[] <- lapply(df1, as.numeric)
df1
# col1 col2 col3
#1 1 2 1
#2 2 1 2
#3 1 1 2
#4 1 2 1
If the columns are 'character' class, then convert to 'factor' and use as.numeric
df1[] <- lapply(df1, function(x) as.numeric(factor(x)))
These are similar options using dplyr or data.table. It may be faster (haven't benchmarked)
library(dplyr)
df1 <- mutate_each(df1, funs(as.numeric(.)))
If you use %<>% from magrittr, can avoid assigning to a new object or the existing one.
library(magrittr)
df1 %<>%
mutate_each(funs(as.numeric(.)))
Or
library(data.table)
setDT(df1)[, lapply(.SD, as.numeric)]
Or a bit more efficient method with set as it modifies columns by reference and the overhead of [.data.table is avoided
setDT(df1)
for(j in 1:ncol(df1)){
set(df1, i=NULL, j=j, value= as.numeric(df1[[j]]))
}
data
df1 <- structure(list(col1 = structure(c(1L, 2L, 1L, 1L),
.Label = c("abcd",
"gfds"), class = "factor"), col2 = structure(c(2L, 1L, 1L, 2L
), .Label = c("erwq", "rweff"), class = "factor"),
col3 = structure(c(1L,
2L, 2L, 1L), .Label = c("3433534", "trdfs"), class = "factor")),
.Names = c("col1",
"col2", "col3"), row.names = c(NA, -4L), class = "data.frame")