How to match characters change in R - r

I have below-mentioned dataframe in R:
ID source_field_1 field_1 source_field_3 field_3
ER-1 AC45U CD34I 1992-01-23 23/01/1992
ER-2 AB15X 1971-01-23 23/1/1971
ER-3 DB22U AC22Z 1962-11-13 3/11/1962
ER-4 CF12R BA23D 1992-01-23 23/01/1992
I need a group by count of change of characters from column source_field_1 to field_1, from A to Z and from 0 to 9.
Required Output:
source_field_1 A B C D E . . . Z
A 1
B 1
C 1 1
D 1
E
F 1
.
. 1
. 1
Z
Need the same structure for numerical characters as well for both field_1 and field_3.

df1 <- na.omit(df)
# Cross-tabulate character-by-character changes between two string vectors.
#
# from, to: character vectors of the same length (before / after values).
# nm:       the set of characters to keep (e.g. LETTERS or 0:9); every other
#           character is stripped before the comparison.
# Returns a contingency table counting how often each kept character in
# `from` is paired, position by position, with each kept character in `to`.
create <- function(from, to, nm) {
  # Negated character class: matches anything that is NOT one of `nm`.
  drop_pattern <- sprintf("[^%s]", paste0(nm, collapse = ""))
  # Strip unwanted characters, then flatten into single characters.
  keep_chars <- function(strings) {
    unlist(strsplit(gsub(drop_pattern, "", strings), ""))
  }
  from <- keep_chars(from)
  to <- keep_chars(to)
  table(from, to)
}
create(df1$source_field_1,df1$field_1,0:9)
to
from 2 3 4
1 1 0 0
2 2 1 0
4 0 1 0
5 0 0 1
create(df1$source_field_1,df1$field_1,LETTERS)
to
from A B C D I Z
A 0 0 1 0 0 0
B 0 0 1 0 0 0
C 0 1 0 1 0 0
D 1 0 0 0 0 0
F 1 0 0 0 0 0
R 0 0 0 1 0 0
U 0 0 0 0 1 1

This is rather simple to achieve by splitting up each character and using the table function.
library(stringr)
df <- [your df]
# Build one cross-tabulation per row of df; results are collected in `out`.
# Preallocate the result list with one slot per row.
out <- vector('list', nrow(df))
for(i in seq_along(out)){
# Split both columns of row i into single characters
# (gives a list of two character vectors: source_field_1, field_1)
splitted_str <- str_split(unlist(df[i, c('source_field_1', 'field_1')]), '')
# Alternative in base R:
# strsplit(unlist(df[i, c('source_field_1', 'field_1')]), '')
# Convert to factors; fixing the levels to LETTERS makes every table 26 x 26
# and turns all non-letter characters into NA
splitted_str <- lapply(splitted_str, factor, levels = LETTERS)
# Create the table. dnn sets the names shown for rows/columns
out[[i]] <- table(splitted_str, dnn = c('source_field_1', 'field_1'))
}
Note that I rely on the fact that factor(...) turns every value not in levels into NA, and that table(...) excludes NA values by default.
Obviously this could all be combined into a single line
# One cross-tabulation per row of df, written as a single expression.
# The row index is the anonymous function's own argument, so each table is
# built from its own row rather than from a variable `i` looked up in the
# enclosing environment. seq_len() gives an empty sequence for a zero-row df.
out <- lapply(seq_len(nrow(df)), function(i) {
  # Split both fields of row i into single characters.
  chars <- str_split(unlist(df[i, c('source_field_1', 'field_1')]), '')
  # Fixed LETTERS levels: non-letters become NA and are dropped by table().
  table(lapply(chars, factor, levels = LETTERS),
        dnn = c('source_field_1', 'field_1'))
})

Related

Add X number of columns to a data.frame

I would like to add a varying number (X) of columns with 0 to an existing data.frame within a function.
Here is an example data.frame:
dt <- data.frame(x=1:3, y=4:6)
I would like to get this result if X=1 :
a x y
1 0 1 4
2 0 2 5
3 0 3 6
And this if X=3 :
a b c x y
1 0 0 0 1 4
2 0 0 0 2 5
3 0 0 0 3 6
What would be an efficient way to do this?
We can assign multiple columns to '0' based on the value of 'X'
X <- 3
nm1 <- names(dt)
dt[letters[seq_len(X)]] <- 0
dt[c(setdiff(names(dt), nm1), nm1)]
Also, we can use add_column from tibble and create columns at a specific location
library(tibble)
add_column(dt, .before = 1, !!!setNames(as.list(rep(0, X)),
letters[seq_len(X)]))
A second option is cbind
# Prepend n zero-filled columns to a data frame.
#
# x: data frame to extend.
# n: number of zero columns to put in front of x (default 3). The new
#    columns are named with the first n lowercase letters, so names are only
#    well-defined for n <= 26.
# Returns a data frame with the n new columns followed by the columns of x.
f <- function(x, n = 3) {
  zeros <- matrix(
    0,
    ncol = n,
    nrow = nrow(x),
    # seq_len(n) is empty for n = 0, whereas letters[1:0] would yield "a"
    # and no longer match the number of columns.
    dimnames = list(NULL, letters[seq_len(n)])
  )
  cbind.data.frame(zeros, x)
}
f(dt, 5)
# a b c d e x y
#1 0 0 0 0 0 1 4
#2 0 0 0 0 0 2 5
#3 0 0 0 0 0 3 6
NOTE: because letters has a length of 26 the function would need some adjustment regarding the naming scheme if n > 26.
You can try the code below
dt <- cbind(`colnames<-`(t(rep(0,X)),letters[seq(X)]),dt)
If you don't care about the names of the added columns, you can simply use
dt <- cbind(t(rep(0,X)),dt)
which is much shorter

Randomly insert 1's in columns of a data frame

I want to randomly insert 1's in the columns of a data frame that do not currently have 1 in them. Using different seeds for each of the variables.
Below is the code I have written so far:
# create the data frame
df <- data.frame(A = c(0,0,0,0,0,0,0,0,0,0),
B = c(0,0,0,0,0,0,0,0,0,0),
C = c(0,1,0,0,0,1,0,1,0,0),
D = c(0,0,0,0,0,0,0,0,0,0),
E = c(0,1,0,1,0,0,0,0,0,0))
# get index of columns that have 1's in them
one_index <- which(grepl(pattern = 1, df))
# function to randomly put 1's with seperate seeds
funcccs <- function(x){
i = 0
for (i in 1:ncol(x)) {
set.seed(i + 1)
x[sample(nrow(x),3)] <- 1
}}
# Apply the function to the columns that do not have 1
funcccs(df[,-one_index])
Below is the error message I get:
Error in [<-.data.frame (*tmp*, sample(nrow(x), 3), value = 1) :
new columns would leave holes after existing columns
Based on the above example, the function should randomly insert 3 values of 1 in variables 'A', 'B' and 'D', because these 3 variables do not currently have 1's in them.
Any help will be appreciated. Thanks
df <- data.frame(A = c(0,0,0,0,0,0,0,0,0,0),
B = c(0,0,0,0,0,0,0,0,0,0),
C = c(0,1,0,0,0,1,0,1,0,0),
D = c(0,0,0,0,0,0,0,0,0,0),
E = c(0,1,0,1,0,0,0,0,0,0))
one_index <- which(grepl(pattern = 1, df))
# Randomly set three entries to 1 in every column of x.
#
# x: data frame (or matrix) whose columns should each receive three 1's.
# Returns x with, for each column i, three randomly sampled rows set to 1.
# The RNG is re-seeded with i + 1 before sampling for column i, so every
# column uses its own seed and the result is reproducible. As a side effect
# this resets the global random number generator state.
funcccs <- function(x) {
  # seq_len() is empty for zero columns; 1:ncol(x) would iterate over c(1, 0).
  for (i in seq_len(ncol(x))) {
    set.seed(i + 1)
    # Index rows AND column i; indexing with rows alone addresses columns
    # of a data frame.
    x[sample(nrow(x), 3), i] <- 1
  }
  x
}
df[,-one_index]=funcccs(df[,-one_index])
You were indexing the whole data frame instead of just the i-th column.
> df
A B C D E
1 0 0 0 1 0
2 1 1 1 0 1
3 0 0 0 1 0
4 0 1 0 0 1
5 1 0 0 0 0
6 0 0 1 1 0
7 1 0 0 0 0
8 0 1 1 0 0
9 0 0 0 0 0
10 0 0 0 0 0

Extract common values pairs for multiple dataframes to create a new binary dataframe based on them

I have 3 dataframes
Drug<-c("ab","bc","cd","ef","gh")
Target<-c("qwewr","saff","cxzcc","sadda","sadd")
fileA<-data.frame(Drug,Target)
Drug<-c("ab","bc","cdD","efc","ghg","hj")
Target<-c("qwewr","saff","cxzccf","saddav","sadd","bn")
fileB<-data.frame(Drug,Target)
Drug<-c("abB","bcv","cdD","efc")
Target<-c("qwewrm","saff","cxzccfh","saddav")
fileC<-data.frame(Drug,Target)
As you can see each one contains a pair "Drug"-"Target". Every dataframe contains only unique pairs. But you can find exactly the same pair in the other dataframes. What I want to achieve is to create a new dataframe which will extract all the unique pairs in the first column and then in the other 3 columns will have the fileA, fileB and fileC which will be filled with 1 if the pair exists and 0 if the pair does not exist. Something like:
Pairs fileA fileB fileC
1: abqwewr 1 1 1
2: bcsaff 1 1 1
3: cdcxzcc 1 1 1
4: efsadda 1 1 1
5: ghsadd 1 1 0
6: cdDcxzccf 0 0 0
7: efcsaddav 0 0 0
8: ghgsadd 0 0 0
9: hjbn 0 0 0
10: abBqwewrm 0 0 0
11: bcvsaff 0 0 0
12: cdDcxzccfh 0 0 0
But here the dataframe is not correct since in the first column there is only the drug name and also each row should have had at least one 1.
My method:
# Create composite dataset by combining all files
compositeDataD <- rbind(fileA,fileB,fileC)
# Get unique (drug, target) pairs
# Connect Drug Names and Target Gene Symbols into one vector of pairs
compositeDataD <- na.omit(compositeDataD)
DrugTargetPairsD <- paste(compositeDataD$Drug,compositeDataD$Target,sep="")
uniquePairsD<-unique(DrugTargetPairsD)
PairsA <- DrugTargetPairsD[1:nrow(na.omit(fileA))]
PairsB <- DrugTargetPairsD[1:nrow(na.omit(fileB))]
PairsC <- DrugTargetPairsD[1:nrow(na.omit(fileC))]
# Create binary matrix for unique (drug, target) pairs
binaryA <- as.numeric(uniquePairsD %in% PairsA) # This function returns a binary value for each unique (Drug, Target) Pair compared with the content of file1
binaryB <- as.numeric(uniquePairsD %in% PairsB)
binaryC <- as.numeric(uniquePairsD %in% PairsC)
table33 <- data.table(Pairs=uniquePairsD,
fileA=binaryA,fileB=binaryB,
fileC=binaryC)
Form list L from the three objects and use lapply to paste their columns together and then stack to create a 2 column data frame with the pasted values and an indicator of which object it came from. Finally use table to provide the counts.
L <- mget(ls(pattern = "file"))
s <- stack(lapply(L, function(x) paste0(x[[1]], x[[2]])))
table(s)
giving:
ind
values fileA fileB fileC
abBqwewrm 0 0 1
abqwewr 1 1 0
bcsaff 1 1 0
bcvsaff 0 0 1
cdcxzcc 1 0 0
cdDcxzccf 0 1 0
cdDcxzccfh 0 0 1
efcsaddav 0 1 1
efsadda 1 0 0
ghgsadd 0 1 0
ghsadd 1 0 0
hjbn 0 1 0
A variation of this is to express it as this pipeline:
library(magrittr)
mget(ls(pattern = "file")) %>%
lapply(function(x) paste0(x[[1]], x[[2]])) %>%
stack %>%
table
You can first create the Pairs and then merge on them, while carrying a column where the data came from:
Create the indicator column in each file:
fileA$fileA <- 1
fileB$fileB <- 1
fileC$fileC <- 1
Create the pairs in each file:
fileA$DrugTargetPair <- paste0(fileA$Drug, fileA$Target)
fileB$DrugTargetPair <- paste0(fileB$Drug, fileB$Target)
fileC$DrugTargetPair <- paste0(fileC$Drug, fileC$Target)
Select only the indicator column and the Pairs column:
fileA <- fileA[, c("DrugTargetPair", "fileA")]
fileB <- fileB[, c("DrugTargetPair", "fileB")]
fileC <- fileC[, c("DrugTargetPair", "fileC")]
Merge on the Pairs column, keeping all pairs from both sides (a full outer join via the all argument):
# Full outer joins on the pair key, so pairs found in only some of the files
# are kept. TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
file_new <- merge(fileA, fileB, by = "DrugTargetPair", all = TRUE)
file_new <- merge(file_new, fileC, by = "DrugTargetPair", all = TRUE)
# A pair missing from a file has NA in that file's indicator column;
# turn those into 0 to get a 0/1 membership table.
file_new[is.na(file_new)] <- 0
file_new
DrugTargetPair fileA fileB fileC
1 abBqwewrm 0 0 1
2 abqwewr 1 1 0
3 bcsaff 1 1 0
4 bcvsaff 0 0 1
5 cdcxzcc 1 0 0
6 cdDcxzccf 0 1 0
7 cdDcxzccfh 0 0 1
8 efcsaddav 0 1 1
9 efsadda 1 0 0
10 ghgsadd 0 1 0
11 ghsadd 1 0 0
12 hjbn 0 1 0
data:
Drug<-c("ab","bc","cd","ef","gh")
Target<-c("qwewr","saff","cxzcc","sadda","sadd")
fileA<-data.frame(I(Drug),I(Target))
Drug<-c("ab","bc","cdD","efc","ghg","hj")
Target<-c("qwewr","saff","cxzccf","saddav","sadd","bn")
fileB<-data.frame(I(Drug),I(Target))
Drug<-c("abB","bcv","cdD","efc")
Target<-c("qwewrm","saff","cxzccfh","saddav")
fileC<-data.frame(I(Drug),I(Target))
code:
# Build a 0/1 membership matrix: one row per unique (Drug, Target) pair,
# one column per input file.
# Keep the three files in a list so they can be iterated over.
all_list <- list(fileA, fileB, fileC)
# Stack all rows of the three files ...
all1 <- rbind(fileA,fileB,fileC)
# ... and keep each distinct (Drug, Target) row once.
all1 <- as.data.frame(unique(all1))
# For every unique row (drgT) and every file (x): apply(x,1,list) followed by
# the non-recursive unlist turns the file into a list of its rows, and %in%
# tests whether drgT is one of them; *1 converts the logical to 0/1.
# t() puts pairs in rows and files in columns.
ans <- t(apply(all1, 1, function(drgT){ sapply(all_list, function(x) {(list(drgT) %in% unlist(apply(x,1,list), recursive = F))*1} ) }))
# Rows whose pair occurs in exactly one file are reset to all zeros
# NOTE(review): presumably done to mimic the question's sample output — confirm
ans[rowSums(ans) == 1,] <- 0
# Show the pairs next to their per-file indicators.
cbind(all1, ans)
result:
# Drug Target 1 2 3
#1 ab qwewr 1 1 0
#2 bc saff 1 1 0
#3 cd cxzcc 0 0 0
#4 ef sadda 0 0 0
#5 gh sadd 0 0 0
#8 cdD cxzccf 0 0 0
#9 efc saddav 0 1 1
#10 ghg sadd 0 0 0
#11 hj bn 0 0 0
#12 abB qwewrm 0 0 0
#13 bcv saff 0 0 0
#14 cdD cxzccfh 0 0 0
please note:
please revise your example data/ desired outcome.
please E D U C A T E yourself on stringsAsFactors.

extracting maximum value of cumulative sum into a new column

A sample of data set:
testdf <- data.frame(risk_11111 = c(0,0,1,2,3,0,1,2,3,4,0), risk_11112 = c(0,0,1,2,3,0,1,2,0,1,0))
And I need output data set which would contain new column where only maximum values of cumulative sum will be maintained:
testdf <- data.frame(risk_11111 = c(0,0,1,2,3,0,1,2,3,4,0),
risk_11111_max = c(0,0,0,0,3,0,0,0,0,4,0),
risk_11112 = c(0,0,1,2,3,0,1,2,0,1,0),
risk_11112_max = c(0,0,0,0,3,0,0,2,0,1,0))
I am guessing this needs some logical subsetting of the vectors column-wise with apply, extracting the maximum value by its position index, and mutating the result into new variables.
I don't know how to extract the values for the new variables.
Thanks
Something like this with base R:
# For each column, zero every value that is followed by a larger one, so only
# the last value of each increasing run (its maximum) is kept.
lapply(testdf, function(x) {
  # diff(x) is one element shorter than x. Pad with FALSE so the logical
  # index is not recycled onto the last element, which has no successor and
  # must always be kept.
  x[c(diff(x) > 0, FALSE)] <- 0
  x
})
And to have all in one data.frame:
# Append, for every column of testdf, a copy in which only the last value of
# each increasing run (its maximum) is kept and all other values are zeroed.
dfout <- cbind(testdf, lapply(testdf, function(x) {
  # diff(x) is one element shorter than x. Pad with FALSE so the logical
  # index is not recycled onto the last element, which has no successor and
  # must always be kept.
  x[c(diff(x) > 0, FALSE)] <- 0
  x
}))
# The first half keeps the original names; the appended copies get the
# hard-coded "_max" names.
names(dfout) <- c(names(testdf), 'risk_1111_max', 'risk_1112_max')
Output:
risk_11111 risk_11112 risk_1111_max risk_1112_max
1 0 0 0 0
2 0 0 0 0
3 1 1 0 0
4 2 2 0 0
5 3 3 3 3
6 0 0 0 0
7 1 1 0 0
8 2 2 0 2
9 3 0 0 0
10 4 1 4 1
11 0 0 0 0

how to subset a data frame based on list of multiple match case in columns

So I have a list that contains certain characters as shown below
list <- c("MY","GM+" ,"TY","RS","LG")
And I have a variable named "CODE" in the data frame as follows
code <- c("MY GM+","","LGTY", "RS","TY")
df <- data.frame(1:5,code)
df
code
1 MY GM+
2
3 LGTY
4 RS
5 TY
Now I want to create 5 new variables named "MY","GM+","TY","RS","LG"
Which takes binary value, 1 if there's a match case in the CODE variable
df
code MY GM+ TY RS LG
1 MY GM+ 1 1 0 0 0
2 0 0 0 0 0
3 LGTY 0 0 1 0 1
4 RS 0 0 0 1 0
5 TY 0 0 1 0 0
Really appreciate your help. Thank you.
Since you know how many values will be returned (5), and what you want their types to be (integer), you could use vapply() with grepl(). We can turn the resulting logical matrix into integer values by using integer() in vapply()'s FUN.VALUE argument.
cbind(df, vapply(List, grepl, integer(nrow(df)), df$code, fixed = TRUE))
# code MY GM+ TY RS LG
# 1 MY GM+ 1 1 0 0 0
# 2 0 0 0 0 0
# 3 LGTY 0 0 1 0 1
# 4 RS 0 0 0 1 0
# 5 TY 0 0 1 0 0
I think your original data has a couple of typos, so here's what I used:
List <- c("MY", "GM+" , "TY", "RS", "LG")
df <- data.frame(code = c("MY GM+", "", "LGTY", "RS", "TY"))

Resources