I have a DocumentTermMatrix data_tags with 80,000 rows (groups of tags)
and 900,000 columns, i.e. 900,000 different tags.
Through findFreqTerms(data_tags, 2) I found out that about 462,000 tags are unique (appear only once).
I want to make a function in which two things happen:
- delete these 462,000 columns, so that only tags with frequency 2 or more are left;
- create one new column (Uniques): for each row, the sum of all the unique tags that were removed.
tag1 tag2 tag3 tag4
1 0 0 1 0
2 0 1 0 0
2 1 0 0 0
3 1 0 0 0
4 0 1 0 1
5 1 0 0 0
6 0 1 0 0
For example, tag3 and tag4 are unique (each appears only once in its column):
tag1 tag2 Uniques
1 0 0 1
2 0 1 0
2 1 0 0
3 1 0 0
4 0 1 1
5 1 0 0
6 0 1 0
Thanks in advance for the help.
Maybe the following works for you.
library(slam)
library(tm)
set.seed(0)
terms <- sapply(LETTERS, function(letter) paste(rep.int(letter, 5), collapse = ""))
ndocs <- 5
doc_lengths <- sample(5:10, ndocs, TRUE)
docs <- lapply(doc_lengths, function(doc_len) sample(terms, doc_len, TRUE))
dtm <- DocumentTermMatrix(Corpus(VectorSource(docs)))
as.matrix(dtm)
## delete columns so that only terms with frequency >= 2 are left
## here the function col_sums from the slam package helps
b <- col_sums(dtm) >= 2
dtm_deleted <- dtm[,!b]
dtm <- dtm[,b]
as.matrix(dtm)
## Uniques columns
as.matrix(dtm_deleted)
row_sums(dtm_deleted > 0)
dtm_new <- cbind(dtm, Uniques = row_sums(dtm_deleted > 0))
colnames(dtm_new)[ncol(dtm_new)] <- "Uniques"
as.matrix(dtm_new)
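Since the question asks for a function, the steps above can be wrapped up like this (a minimal sketch; the name collapse_uniques and the min_freq argument are my own, everything else reuses col_sums/row_sums from slam):
collapse_uniques <- function(dtm, min_freq = 2) {
  keep <- col_sums(dtm) >= min_freq            ## terms occurring at least min_freq times
  dropped <- dtm[, !keep]                      ## the "unique" terms to be collapsed
  res <- cbind(dtm[, keep], Uniques = row_sums(dropped > 0))
  colnames(res)[ncol(res)] <- "Uniques"
  res
}
## applied to the example DTM built above:
as.matrix(collapse_uniques(DocumentTermMatrix(Corpus(VectorSource(docs))), min_freq = 2))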
I have a huge dataset and I want to compute the correlation of each item with the total score of the scale, excluding the item itself from the total. I could do this separately for each item, but I am trying to write a loop so that it is a bit easier.
Example dataset:
dat <- read.table(header=TRUE, text="
ItemX1 ItemX2 ItemX3 ItemX4 ItemX5 ItemX6 ItemY1 ItemY2 ItemY3 ItemY4 ItemY5 ItemY6
1 1 0 1 0 1 1 1 0 1 0 1
0 1 0 0 0 1 0 1 0 0 0 1
1 1 0 1 0 1 1 1 0 1 0 0
1 0 1 0 0 1 1 0 1 0 0 1
1 1 0 1 1 1 1 1 0 1 1 0
0 0 1 1 0 0 0 0 1 1 0 0
")
library(dplyr)
xscore <- rowSums(select(dat, starts_with("ItemX")))
Now I could do it like the following, but as I have 107 Items it is a bit much.
cor(dat$ItemX1,rowSums(select(dat, starts_with("ItemX") & -"ItemX1")),use="pairwise.complete.obs")
cor(dat$ItemX2,rowSums(select(dat, starts_with("ItemX") & -"ItemX2")),use="pairwise.complete.obs")
cor(dat$ItemX3,rowSums(select(dat, starts_with("ItemX") & -"ItemX3")),use="pairwise.complete.obs")
cor(dat$ItemX4,rowSums(select(dat, starts_with("ItemX") & -"ItemX4")),use="pairwise.complete.obs")
cor(dat$ItemX5,rowSums(select(dat, starts_with("ItemX") & -"ItemX5")),use="pairwise.complete.obs")
cor(dat$ItemX6,rowSums(select(dat, starts_with("ItemX") & -"ItemX6")),use="pairwise.complete.obs")
That's why I'm trying out the following loop, but I don't know how to specify that the rowSums is calculated without the item that is currently used in the correlation.
variables <- names(dat)
names.item <- c(grep("ItemX", variables, value = TRUE))
item.diff.p <- data.frame(matrix(NA, ncol=2, nrow=(length(names.item)-1)))
names(item.diff.p) <- c("Item", "cor")
length(names.item)
for(i in 1:(length(names.item))-1){
item <- names.item[i]
par <- cor(dat[,names(dat)[grepl("ItemX",names(dat))]],
rowSums(select(dat, starts_with("ItemX"))),use="pairwise.complete.obs")
item.diff.p[i, c("cor")]
}
par
Thank you all!
You can iterate through the columns of a subsetted data frame and calculate:
X_dat = dat[,grep("^ItemX",colnames(dat))]
res = sapply(1:ncol(X_dat),function(i){
  # correlate item i with the row sum of all other items; use="p" abbreviates "pairwise.complete.obs"
  cor(X_dat[,i],rowSums(X_dat[,-i]),use="p")
})
names(res) = colnames(X_dat)
res
ItemX1 ItemX2 ItemX3 ItemX4 ItemX5 ItemX6
0.6324555 0.1250000 -0.7500000 0.1250000 0.4152274 0.2335497
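If the same corrected item-total correlation is also needed for other scales (e.g. the ItemY block), the idea generalizes with a loop over prefixes; a minimal sketch, assuming both prefixes occur in dat:
prefixes <- c("ItemX", "ItemY")
res_all <- lapply(prefixes, function(p) {
  sub_dat <- dat[, grep(paste0("^", p), colnames(dat)), drop = FALSE]
  setNames(sapply(seq_len(ncol(sub_dat)), function(i)
    cor(sub_dat[, i], rowSums(sub_dat[, -i]), use = "pairwise.complete.obs")),
    colnames(sub_dat))
})
names(res_all) <- prefixes
res_all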
I have 3 dataframes
Drug<-c("ab","bc","cd","ef","gh")
Target<-c("qwewr","saff","cxzcc","sadda","sadd")
fileA<-data.frame(Drug,Target)
Drug<-c("ab","bc","cdD","efc","ghg","hj")
Target<-c("qwewr","saff","cxzccf","saddav","sadd","bn")
fileB<-data.frame(Drug,Target)
Drug<-c("abB","bcv","cdD","efc")
Target<-c("qwewrm","saff","cxzccfh","saddav")
fileC<-data.frame(Drug,Target)
As you can see, each one contains "Drug"-"Target" pairs. Every data frame contains only unique pairs, but exactly the same pair can appear in the other data frames. What I want to achieve is a new data frame that has all the unique pairs in the first column, and in the other 3 columns (fileA, fileB and fileC) a 1 if the pair exists in that file and a 0 if it does not. Something like:
Pairs fileA fileB fileC
1: abqwewr 1 1 1
2: bcsaff 1 1 1
3: cdcxzcc 1 1 1
4: efsadda 1 1 1
5: ghsadd 1 1 0
6: cdDcxzccf 0 0 0
7: efcsaddav 0 0 0
8: ghgsadd 0 0 0
9: hjbn 0 0 0
10: abBqwewrm 0 0 0
11: bcvsaff 0 0 0
12: cdDcxzccfh 0 0 0
But here the data frame is not correct, since the first column contains only the drug name, and each row should also have had at least one 1.
My method:
# Create composite dataset by combining all files
compositeDataD <- rbind(fileA,fileB,fileC)
# Get unique (drug, target) pairs
# Connect Drug Names and Target Gene Symbols into one vector of pairs
compositeDataD <- na.omit(compositeDataD)
DrugTargetPairsD <- paste(compositeDataD$Drug,compositeDataD$Target,sep="")
uniquePairsD<-unique(DrugTargetPairsD)
PairsA <- DrugTargetPairsD[1:nrow(na.omit(fileA))]
PairsB <- DrugTargetPairsD[1:nrow(na.omit(fileB))]
PairsC <- DrugTargetPairsD[1:nrow(na.omit(fileC))]
# Create binary matrix for unique (drug, target) pairs
binaryA <- as.numeric(uniquePairsD %in% PairsA) # This function returns a binary value for each unique (Drug, Target) Pair compared with the content of file1
binaryB <- as.numeric(uniquePairsD %in% PairsB)
binaryC <- as.numeric(uniquePairsD %in% PairsC)
library(data.table)
table33 <- data.table(Pairs=uniquePairsD,
                      fileA=binaryA, fileB=binaryB,
                      fileC=binaryC)
Form a list L from the three objects and use lapply to paste their columns together, then stack to create a two-column data frame with the pasted values and an indicator of which object each came from. Finally, use table to provide the counts.
L <- mget(ls(pattern = "file"))
s <- stack(lapply(L, function(x) paste0(x[[1]], x[[2]])))
table(s)
giving:
ind
values fileA fileB fileC
abBqwewrm 0 0 1
abqwewr 1 1 0
bcsaff 1 1 0
bcvsaff 0 0 1
cdcxzcc 1 0 0
cdDcxzccf 0 1 0
cdDcxzccfh 0 0 1
efcsaddav 0 1 1
efsadda 1 0 0
ghgsadd 0 1 0
ghsadd 1 0 0
hjbn 0 1 0
A variation of this is to express it as this pipeline:
library(magrittr)
mget(ls(pattern = "file")) %>%
lapply(function(x) paste0(x[[1]], x[[2]])) %>%
stack %>%
table
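If a data frame in the layout shown in the question is preferred over the printed contingency table, the result of table can be converted afterwards (a sketch, assuming s from the stack step above):
tab <- table(s)
data.frame(Pairs = rownames(tab), as.data.frame.matrix(tab), row.names = NULL)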
You can first create the Pairs and then merge on them, carrying along an indicator column for where the data came from.
Create the indicator column in each file:
fileA$fileA <- 1
fileB$fileB <- 1
fileC$fileC <- 1
Create the pairs in each file:
fileA$DrugTargetPair <- paste0(fileA$Drug, fileA$Target)
fileB$DrugTargetPair <- paste0(fileB$Drug, fileB$Target)
fileC$DrugTargetPair <- paste0(fileC$Drug, fileC$Target)
Select only the indicator column and the Pairs column:
fileA <- fileA[, c("DrugTargetPair", "fileA")]
fileB <- fileB[, c("DrugTargetPair", "fileB")]
fileC <- fileC[, c("DrugTargetPair", "fileC")]
Merge on the Pairs column, keeping all Pairs with all = T:
file_new <- merge(fileA, fileB, by = "DrugTargetPair", all = T)
file_new <- merge(file_new, fileC, by = "DrugTargetPair", all = T)
file_new[is.na(file_new)] <- 0
file_new
DrugTargetPair fileA fileB fileC
1 abBqwewrm 0 0 1
2 abqwewr 1 1 0
3 bcsaff 1 1 0
4 bcvsaff 0 0 1
5 cdcxzcc 1 0 0
6 cdDcxzccf 0 1 0
7 cdDcxzccfh 0 0 1
8 efcsaddav 0 1 1
9 efsadda 1 0 0
10 ghgsadd 0 1 0
11 ghsadd 1 0 0
12 hjbn 0 1 0
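With more than three files, the repeated merge calls can be collapsed with Reduce (a sketch, assuming each file already carries its DrugTargetPair and indicator columns as prepared above):
file_new <- Reduce(function(x, y) merge(x, y, by = "DrugTargetPair", all = TRUE),
                   list(fileA, fileB, fileC))
file_new[is.na(file_new)] <- 0
file_new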
data:
Drug<-c("ab","bc","cd","ef","gh")
Target<-c("qwewr","saff","cxzcc","sadda","sadd")
fileA<-data.frame(I(Drug),I(Target))
Drug<-c("ab","bc","cdD","efc","ghg","hj")
Target<-c("qwewr","saff","cxzccf","saddav","sadd","bn")
fileB<-data.frame(I(Drug),I(Target))
Drug<-c("abB","bcv","cdD","efc")
Target<-c("qwewrm","saff","cxzccfh","saddav")
fileC<-data.frame(I(Drug),I(Target))
code:
all_list <- list(fileA, fileB, fileC)
all1 <- rbind(fileA,fileB,fileC)
all1 <- as.data.frame(unique(all1))
ans <- t(apply(all1, 1, function(drgT) {
  sapply(all_list, function(x) {
    (list(drgT) %in% unlist(apply(x, 1, list), recursive = FALSE)) * 1
  })
}))
ans[rowSums(ans) == 1,] <- 0
cbind(all1, ans)
result:
# Drug Target 1 2 3
#1 ab qwewr 1 1 0
#2 bc saff 1 1 0
#3 cd cxzcc 0 0 0
#4 ef sadda 0 0 0
#5 gh sadd 0 0 0
#8 cdD cxzccf 0 0 0
#9 efc saddav 0 1 1
#10 ghg sadd 0 0 0
#11 hj bn 0 0 0
#12 abB qwewrm 0 0 0
#13 bcv saff 0 0 0
#14 cdD cxzccfh 0 0 0
Please note:
please revise your example data / desired outcome.
please read up on stringsAsFactors.
As I am new to R, this question may seem like a piece of cake to you.
I have data in txt format. The first column has the cluster number and the second column has the names of different organisms.
For example:
0 org4|gene759
1 org1|gene992
2 org1|gene1101
3 org4|gene757
4 org1|gene1702
5 org1|gene989
6 org1|gene990
7 org1|gene1699
9 org1|gene1102
10 org4|gene2439
10 org1|gene1374
I need to re-arrange/reshape the data into the following format.
Cluster No. Org 1 Org 2 org3 org4
0 0 0 1
1 0 0 0
I could not figure out how to do it in R.
Thanks
We could use table
out <- cbind(ClusterNo = seq_len(nrow(df1)),
             as.data.frame.matrix(table(seq_len(nrow(df1)),
                                        factor(sub("\\|.*", "", df1[[2]]),
                                               levels = paste0("org", 1:4)))))
head(out, 2)
# ClusterNo org1 org2 org3 org4
#1 1 0 0 0 1
#2 2 1 0 0 0
It is also possible that we need to use the first column (the cluster number) to get the frequency:
out1 <- as.data.frame.matrix(table(df1[[1]],
factor(sub("\\|.*", "", df1[[2]]), levels = paste0("org", 1:4))))
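To keep the cluster number as an explicit column rather than only as row names (a sketch, assuming out1 from the step above):
out1 <- cbind(ClusterNo = as.integer(rownames(out1)), out1)
head(out1, 2)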
Reading the table into R can be done with
input <- read.table('filename.txt')
Then we can extract the relevant number from the org4|gene759 string using a regular expression, and set this to a third column of our input:
input[, 3] <- as.integer(gsub('^org(.+)\\|.*', '\\1', input[, 2]))  # as.integer so the org number can be used numerically below
Our input data now looks like this:
> input
V1 V2 V3
1 0 org4|gene759 4
2 1 org1|gene992 1
3 2 org1|gene1101 1
4 3 org4|gene757 4
5 4 org1|gene1702 1
6 5 org1|gene989 1
7 6 org1|gene990 1
8 7 org1|gene1699 1
9 9 org1|gene1102 1
10 10 org4|gene2439 4
11 10 org1|gene1374 1
Then we need to list the possible values of org:
possibleOrgs <- seq_len(max(input[, 3])) # = c(1, 2, 3, 4)
Now for the tricky part. The following function takes each unique cluster number in turn (I notice that 10 appears twice in your example data), takes all the rows relating to that cluster, and looks at the org value for those rows.
result <- vapply(unique(input[, 1]), function (x)
  possibleOrgs %in% input[input[, 1] == x, 3], logical(4))
We can then format this result as we like, perhaps using t to transform its orientation, * 1 to convert from TRUEs and FALSEs to 1s and 0s, and colnames to title its columns:
result <- t(result) * 1
colnames (result) <- paste0('org', possibleOrgs)
rownames(result) <- unique(input[, 1])
I hope that this is what you were looking for -- it wasn't quite clear from your question!
Output:
> result
org1 org2 org3 org4
0 0 0 0 1
1 1 0 0 0
2 1 0 0 0
3 0 0 0 1
4 1 0 0 0
5 1 0 0 0
6 1 0 0 0
7 1 0 0 0
9 1 0 0 0
10 1 0 0 1
I have a df that looks like this.
Date Winner
4/12 Tom
4/13 Abe
4/14 George
4/15 Tom
I would like to add new columns that assign a 1 if the name appears in the Winner column and 0 if it does not, and vice versa for a "Lose" column. Ideally the df would look like this as a result:
Date Winner Tom_Win Tom_Lose Abe_Win Abe_Lose George_Win George_Lose
4/12 Tom 1 0 0 1 0 1
4/13 Abe 0 1 1 0 0 1
4/14 George 0 1 0 1 1 0
4/15 Tom 1 0 0 1 0 1
Is there an easy way to accomplish this?
This is extremely simple to do if you use the model.matrix function: it will create N dummy columns with 0 when the name does not appear and 1 when it does (exactly as you requested). The code is below (assuming your data is called db):
> winners <- model.matrix(~Winner - 1, data=db)
> winners
WinnerAbe WinnerGeorge WinnerTom
1 0 0 1
2 1 0 0
3 0 1 0
4 0 0 1
This bit computes the columns with the losing values:
winners <- as.data.frame(winners)
winners$loserAbe <- as.numeric(!winners$WinnerAbe) #naturally you have to
#do this for every column you need
WinnerAbe WinnerGeorge WinnerTom loserAbe
1 0 0 1 1
2 1 0 0 0
3 0 1 0 1
4 0 0 1 1
winners$Date <- db$Date #this last bit so you don't lose the date.
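If you prefer to flip all the win columns in one step instead of one at a time, something like this should also work (a sketch, assuming db as above; win_mat and lose_mat are names of my own choosing):
win_mat <- model.matrix(~ Winner - 1, data = db)
lose_mat <- 1 - win_mat                               # 1 where the player lost, 0 where they won
colnames(lose_mat) <- sub("^Winner", "Loser", colnames(win_mat))
cbind(db["Date"], win_mat, lose_mat)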
Using mtabulate from the qdapTools package, we can do the following three steps:
library(qdapTools)
d1 <- mtabulate(d3$Winner)
d2 <- setNames(data.frame(sapply(d1, function(i) ifelse(i == 1, 0, 1))),
               paste0(names(d1), '_Lose'))
cbind(d3$Date, d1, d2)
# d3$Date Abe George Tom Abe_Lose George_Lose Tom_Lose
#1 4/12 0 0 1 1 1 0
#2 4/13 1 0 0 0 1 1
#3 4/14 0 1 0 1 0 1
#4 4/15 0 0 1 1 1 0
DATA
str(d3)
'data.frame': 4 obs. of 2 variables:
$ Date : Factor w/ 4 levels "4/12","4/13",..: 1 2 3 4
$ Winner: Factor w/ 3 levels "Abe","George",..: 3 1 2 3
I'm sure there is a better way than this but this works in base R and it's fairly simple:
If your data looks like this:
df <- data.frame(Date = c("4/12","4/13","4/14","4/15"),Winner = c("Tom","Abe","George","Tom"))
Append the extra columns like so:
xcols <- c(paste0(unique(df$Winner), '_Win'), paste0(unique(df$Winner), '_Lose'))
df[ , xcols] <- 0
Now make a character vector with instructions to give the points for every player.
evl <- unlist(lapply(unique(df$Winner), function(x) {
  paste0('df[', which(df$Winner == x), ',', which(names(df) == paste0(x, '_Win')), '] <- 1')
}))
And execute the code:
eval(parse(text = evl))
df <- data.frame(
  Date = c("4/12", "4/13", "4/14", "4/15"),
  Winner = c("Tom", "Abe", "George", "Tom"),
  stringsAsFactors = TRUE  # levels() below needs Winner to be a factor (no longer the default since R 4.0)
)
df2 <- do.call(cbind,
lapply(seq_along(levels(df$Winner)), function(x) {
win <- ifelse(df$Winner == levels(df$Winner)[x], 1, 0)
lose <- ifelse(df$Winner == levels(df$Winner)[x], 0, 1)
dat <- cbind(win, lose)
colnames(dat) <- c(paste(levels(df$Winner)[x], "win", sep = "_"), paste(levels(df$Winner)[x], "lose", sep = "_"))
dat
})
)
cbind(df, df2)
> cbind(df, df2)
Date Winner Abe_win Abe_lose George_win George_lose Tom_win Tom_lose
1 4/12 Tom 0 1 0 1 1 0
2 4/13 Abe 1 0 0 1 0 1
3 4/14 George 0 1 1 0 0 1
4 4/15 Tom 0 1 0 1 1 0
So I have a list that contains certain characters as shown below
list <- c("MY","GM+" ,"TY","RS","LG")
And I have a variable named "code" in the data frame, as follows:
code <- c("MY GM+","","LGTY", "RS","TY")
df <- data.frame(1:5,code)
df
code
1 MY GM+
2
3 LGTY
4 RS
5 TY
Now I want to create 5 new variables named "MY", "GM+", "TY", "RS", "LG",
which take a binary value: 1 if there is a match in the code variable and 0 otherwise.
df
code MY GM+ TY RS LG
1 MY GM+ 1 1 0 0 0
2 0 0 0 0 0
3 LGTY 0 0 1 0 1
4 RS 0 0 0 1 0
5 TY 0 0 1 0 0
Really appreciate your help. Thank you.
Since you know how many values will be returned (5), and what you want their types to be (integer), you could use vapply() with grepl(). We can turn the resulting logical matrix into integer values by using integer() in vapply()'s FUN.VALUE argument.
cbind(df, vapply(List, grepl, integer(nrow(df)), df$code, fixed = TRUE))
# code MY GM+ TY RS LG
# 1 MY GM+ 1 1 0 0 0
# 2 0 0 0 0 0
# 3 LGTY 0 0 1 0 1
# 4 RS 0 0 0 1 0
# 5 TY 0 0 1 0 0
I think your original data has a couple of typos, so here's what I used:
List <- c("MY", "GM+" , "TY", "RS", "LG")
df <- data.frame(code = c("MY GM+", "", "LGTY", "RS", "TY"))
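If strict checking of the return type is not required, an equivalent (a sketch) is sapply() with the logical matrix multiplied by 1L:
cbind(df, sapply(List, grepl, x = df$code, fixed = TRUE) * 1L)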