Let's assume that I have a data table with People who watched Movies, like
library(data.table)
DT = fread("
User, Movie
Alice , Fight Club
Alice, The Godfather
Bob, Titanic
Charlotte, The Godfather")
I want to compute, for each pair of movies, the number of people who watched both and the number of people who watched at least one, i.e.
Movie1 Movie2 WatchedOne WatchedBoth
Fight Club The Godfather 2 1
The Godfather Titanic 3 0
Fight Club Titanic 2 0
I have millions of rows and I would need a blazingly fast data.table function :-)
Thanks for help!
Another way:
DT = DT[, .(Users = list(User)), keyby='Movie']
Y = data.table(t(combn(DT$Movie, 2)))
setnames(Y, c('Movie1','Movie2'))
Y[DT, on=.(Movie1==Movie), Movie1.Users:= Users]
Y[DT, on=.(Movie2==Movie), Movie2.Users:= Users]
#Y[, WatchedOne:= lengths(Map(union, Movie1.Users, Movie2.Users))]
Y[, WatchedBoth:= lengths(Map(intersect, Movie1.Users, Movie2.Users))]
# better:
Y[, WatchedOne:= lengths(Movie1.Users) + lengths(Movie2.Users) - WatchedBoth]
Y[, -(3:4)]
# Movie1 Movie2 WatchedBoth WatchedOne
# 1: Fight Club The Godfather 1 2
# 2: Fight Club Titanic 0 2
# 3: The Godfather Titanic 0 3
This achieves what you are after:
library(data.table)
mydt <- data.table(User = c("Alice", "Alice", "Bob", "Charlotte"),
Movie = c("Fight Club", "The Godfather", "Titanic", "The Godfather"))
##
mydt2 <- data.table(t(mydt[,combn(unique(Movie), 2, simplify = FALSE)]))
names(mydt2) <- c("Movie1", "Movie2")
##
temp <- apply(mydt2, 1, function(x) mydt[Movie %in% x, .N, by = User])
mydt2[, WatchedOne := lapply(temp, function(x) x[, length(N)])]
mydt2[, WatchedBoth := lapply(temp, function(x) x[, sum(N==2)])]
# Movie1 Movie2 WatchedOne WatchedBoth
# 1: Fight Club The Godfather 2 1
# 2: Fight Club Titanic 2 0
# 3: The Godfather Titanic 3 0
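Note that lapply here produces list columns; if plain numeric columns are preferred, sapply should give the same counts as atomic vectors (a small sketch of that variant):
mydt2[, WatchedOne := sapply(temp, function(x) x[, length(N)])]
mydt2[, WatchedBoth := sapply(temp, function(x) x[, sum(N == 2)])]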
@sirallen @simone
Thank you for your answers, I tried both ways.
However, I found the fastest way to be the following:
DT_comb <- as.data.table(t(combn(unique(DT$movie), 2))) # all pairs of distinct movies
colnames(DT_comb) <- c("movie1", "movie2")
function_1 <- function(movie_i, movie_j){ # number of users who watched both
  ur_i = DT[movie == movie_i, user_ID]
  ur_j = DT[movie == movie_j, user_ID]
  x = length(intersect(ur_i, ur_j))
  return(x)
}
function_2 <- function(movie_i, movie_j){ # number of users who watched at least one
  ur_i = DT[movie == movie_i, user_ID]
  ur_j = DT[movie == movie_j, user_ID]
  x = length(union(ur_i, ur_j))
  return(x)
}
library(parallel)
cl <- makeCluster(detectCores() - 1)
clusterExport(cl = cl, varlist = c("DT", "function_1", "function_2"))
clusterCall(cl, function() library(data.table))
DT_comb$Watched_Both <- clusterMap(cl,
                                   function_1, # intersect
                                   DT_comb$movie1,
                                   DT_comb$movie2)
DT_comb$Watched_One <- clusterMap(cl,
                                  function_2, # union
                                  DT_comb$movie1,
                                  DT_comb$movie2)
stopCluster(cl)
Maybe your solutions are even faster than mine when parallelized? :-)
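If you want to compare the approaches on your own data, here is a minimal benchmarking sketch (assuming the microbenchmark package is installed; approach_sirallen, approach_simone and approach_parallel are hypothetical wrapper functions around the respective solutions, not code from this thread):
library(microbenchmark)
# each approach_* below is a placeholder: wrap the corresponding solution
# above in a function of that name before running
microbenchmark(
  sirallen = approach_sirallen(),
  simone = approach_simone(),
  parallel = approach_parallel(),
  times = 10L
)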
Here is my sample data:
a <- data.frame(name = c('Ace CO', 'Bayes', 'aasd', 'Apple', 'Orange', 'Banana',
'Ace CO', 'Bayes', 'aasd', 'Apple', 'Orange', 'Banana'),
date=c(1991,1991,1991,1991,1991,1991,
1992,1992,1992,1992,1992,1992),
price = c(10, 13, 2, 1, 15, 1,
11,15,3,2,14,4))
b <- data.frame(name = c('Ace Co.', 'Bayes INC.', 'asd',
'Ace Co.', 'Bayes INC.', 'asd'),
date=c(1991,1991,1991,1992,1992,1992),
qty = c(9, 99, 10,10,105,15))
I am left joining a to b by date and name; date is exact while name is fuzzy. I have tried stringdist_join, but it only accommodates the fuzzy merge, not an exact match on date at the same time.
The expected output is as follows:
c<- data.frame(name = c('Ace Co.', 'Bayes INC.', 'asd',
'Ace Co.', 'Bayes INC.', 'asd'),
date=c(1991,1991,1991,1992,1992,1992),
qty = c(9, 99, 10,10,105,15),
price = c(10, 13, 2,11,15,3))
I'd like to manipulate it under dplyr.
Using distance matrix to merge fuzzy strings
Main principle
Get the distance matrix between the unique terms of your vectors. Then, check what threshold might lead to the best results (this has to be human supervised, I think).
Then, use this new correspondence table to merge your dataframes. Finally, you can change names (i.e. adding "inc.") more easily because you have "standardized" names.
With utils::adist()
I think stringdist is better because you can choose the method, but here is a base-R example as a suggestion on how to use this concept of distance to get the expected output.
# 1st create a matrix with the Standard Levenshtein distance between the name fields of both sources (or other method from stringdist)
dist_name_matrix <- adist(unique(a$name), unique(b$name), partial = TRUE, ignore.case = TRUE)
colnames(dist_name_matrix) <- unique(b$name)
rownames(dist_name_matrix) <- unique(a$name)
# let's convert this matrix to a dataframe for easier visual inspection; you will need to check it yourself
library(dplyr)
library(tidyr)
dist_df <- dist_name_matrix %>%
as.data.frame() %>%
tibble::rownames_to_column(., "a_name") %>%
pivot_longer(cols = 2:last_col(), names_to = "b_name", values_to = "dist") %>%
filter(dist < 2) # you might need to adapt this to your needs
# Now this can be used to merge your data, i.e.
a %>%
left_join(., dist_df, by = c("name" = "a_name")) %>%
right_join(., b, by = c("b_name" = "name", "date" = "date")) %>%
# added just to match your expected output
filter(!is.na(name)) %>%
select(b_name, date, qty, price)
Output:
b_name date qty price
1 Ace Co. 1991 9 10
2 Bayes INC. 1991 99 13
3 asd 1991 10 2
4 Ace Co. 1992 10 11
5 Bayes INC. 1992 105 15
6 asd 1992 15 3
The same process can be used with stringdist:
library(stringdist)
dist_name_matrix <- stringdistmatrix(unique(a$name), unique(b$name), method = "jw", useBytes = FALSE)
colnames(dist_name_matrix) <- unique(b$name)
rownames(dist_name_matrix) <- unique(a$name)
Then just adapt the threshold after a human check, e.g. filter(dist < 0.2).
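For completeness, the reshaping step shown above can be reused unchanged on this matrix; only the threshold differs, since Jaro-Winkler distances lie in [0, 1] (a sketch):
dist_df <- dist_name_matrix %>%
  as.data.frame() %>%
  tibble::rownames_to_column(., "a_name") %>%
  pivot_longer(cols = 2:last_col(), names_to = "b_name", values_to = "dist") %>%
  filter(dist < 0.2) # adapt after a human check, as above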
agrep solution
The following function is almost surely not as general as it could be, but here it goes.
funMerge <- function(X, Y, col, col_approx, sep = "."){
other_cols.x <- setdiff(names(X), c(col, col_approx))
other_cols.y <- setdiff(names(Y), c(col, col_approx))
sp.x <- split(X, X[[col]])
sp.y <- split(Y, Y[[col]])
common_names <- intersect(names(sp.x), names(sp.y))
res <- sapply(common_names, function(sp.name){
x <- sp.x[[sp.name]]
y <- sp.y[[sp.name]]
k <- sapply(x[[col_approx]], agrep, y[[col_approx]]) # fuzzy matches of each x name in y
k <- k[sapply(k, length) > 0]
k <- unlist(k)
i <- match(names(k), x[[col_approx]]) # rows of x whose names matched
df_other.x <- x[i, other_cols.x, drop = FALSE]
df_other.y <- y[k, other_cols.y, drop = FALSE]
df_tmp <- data.frame(
x[i, col],
names(k),
y[k, col_approx]
)
names(df_tmp) <- c(col, col_approx, paste(col_approx, "y", sep = sep))
cbind(df_tmp, df_other.x, df_other.y)
}, simplify = FALSE)
res <- do.call(rbind, res)
row.names(res) <- NULL
res
}
funMerge(a, b, col = "date", col_approx = "name")
# date name name.y price qty
#1 1991 Ace Co Ace Co. 10 9
#2 1991 Bayes Bayes Inc. 13 99
#3 1991 asd asdf 2 10
#4 1992 Ace Co Ace CO. 11 10
#5 1992 Bayes Bayes INC. 15 105
#6 1992 asd aasdf 3 15
stringdist solution
The following function uses package stringdist to compute the Jaro-Winkler pairwise distances between the columns that need to be matched approximately.
From help('stringdist-metrics'), my emphasis.
The metric you need to choose for an application strongly depends on both the nature of the string (what does the string represent?) and the cause of dissimilarities between the strings you are measuring. For example, if you are comparing human-typed names that may contain typo's, the Jaro-Winkler distance may be of use. If you are comparing names that were written down after hearing them, a phonetic distance may be a better choice.
A more efficient algorithm would be to first split the data sets by the exact match column and then apply the method of funMerge2.
library(stringdist)
funMerge2 <- function(X, Y, col, col_approx, method = "jw", threshold = 0.2){
x <- X[[col_approx]]
y <- Y[[col_approx]]
d <- stringdistmatrix(x, y, method = method, useBytes = FALSE)
w <- which(d < threshold, arr.ind = TRUE)
Z1 <- X[w[, "row"], ]
Z2 <- Y[w[, "col"], ]
res <- cbind(Z1, Z2)
common_cols <- grep(col, names(res))
res <- res[apply(res[, common_cols], 1, function(x) x[1] == x[2]), ]
row.names(res) <- NULL
res
}
funMerge2(a, b, col = "date", col_approx = "name")
# name date price name date qty
#1 Ace Co 1991 10 Ace Co. 1991 9
#2 Bayes 1991 13 Bayes Inc. 1991 99
#3 asd 1991 2 asdf 1991 10
#4 Ace Co 1992 11 Ace CO. 1992 10
#5 Bayes 1992 15 Bayes INC. 1992 105
#6 asd 1992 3 aasdf 1992 15
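A minimal sketch of the split-first idea mentioned above (funMerge3 is a name introduced here for illustration, not part of any package): split both data sets by the exact-match column, then run funMerge2 within each group, so each distance matrix stays small.
funMerge3 <- function(X, Y, col, col_approx, method = "jw", threshold = 0.2){
  sp.x <- split(X, X[[col]])
  sp.y <- split(Y, Y[[col]])
  common <- intersect(names(sp.x), names(sp.y))
  # approximate matching only within groups that share the exact key
  res <- lapply(common, function(nm)
    funMerge2(sp.x[[nm]], sp.y[[nm]], col, col_approx, method, threshold))
  do.call(rbind, res)
}
funMerge3(a, b, col = "date", col_approx = "name")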
I have a dataframe, parsed from Coursera. One of the columns is the number of students enrolled in the course. It looks like this:
df <- data.frame(uni = c("Yale", "Toronto", "NYU"), students = c("16m", "240k", "7.5k"))
uni students
1 Yale "16m"
2 Toronto "240k"
3 NYU "7.5k"
What I need to get is
uni students
1 Yale 16000000
2 Toronto 240000
3 NYU 7500
So, the main difficulty for me is that the values are of class character, and I do not know a function for replacing the k's and m's and converting the column to numeric.
Please help me!
E.g.
df$students <- dplyr::case_when(
  stringr::str_detect(df$students, 'm') ~ readr::parse_number(df$students) * 1e6,
  stringr::str_detect(df$students, 'k') ~ readr::parse_number(df$students) * 1e3,
  TRUE ~ readr::parse_number(df$students)
)
An option with base r:
df$students <- ifelse(grepl('m', df$students, ignore.case = TRUE),
                      as.numeric(gsub("[$m]", "", df$students)) * 10^6,
                      as.numeric(gsub("[$k]", "", df$students)) * 10^3)
# uni students
# 1 Yale 16000000
# 2 Toronto 240000
# 3 NYU 7500
Using stringr and dplyr from the tidyverse:
library(tidyverse)
df %>%
mutate(students = case_when(
str_detect(students, "m") ~ as.numeric(str_extract(students, "[\\d\\.]+")) * 1000000,
str_detect(students, "k") ~ as.numeric(str_extract(students, "[\\d\\.]+")) * 1000,
))
# A tibble: 3 x 2
uni students
<chr> <dbl>
1 Yale 16000000
2 Toronto 240000
3 NYU 7500
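One caveat: this case_when has no default branch, so any value without a suffix would become NA. A sketch with a fallback added (note that as.numeric on suffixed values emits a harmless coercion warning; the first matching branch still wins):
df %>%
  mutate(students = case_when(
    str_detect(students, "m") ~ as.numeric(str_extract(students, "[\\d\\.]+")) * 1000000,
    str_detect(students, "k") ~ as.numeric(str_extract(students, "[\\d\\.]+")) * 1000,
    TRUE ~ as.numeric(students) # plain numbers pass through unchanged
  ))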
Here's an approach with separate that would work for any arbitrary number of modifiers; simply keep defining them in the case_when statement.
library(dplyr)
library(tidyr)
df %>%
separate(students,into = c("value","modifier"),
sep = "(?<=[\\d])(?=[^\\d.])") %>%
mutate(modifier = case_when(modifier == "b" ~ 1000000000,
modifier == "m" ~ 1000000,
modifier == "k" ~ 1000,
TRUE ~ 1),
result = as.numeric(value) * modifier)
uni value modifier result
1 Yale 16 1e+06 1.6e+07
2 Toronto 240 1e+03 2.4e+05
3 NYU 7.5 1e+03 7.5e+03
Using gsub and dplyr:
df %>% mutate(
unit=gsub("[0-9]+\\.*[0-9]*","",students), #selecting unit
value=as.numeric(gsub("([0-9]+\\.*[0-9]+).", "\\1", students)),
students=ifelse(unit=="k",1e3*value,
ifelse(unit=="m",1e6*value,
ifelse(unit=="b",1e9*value,value)))) %>%
select(-c(unit,value))
One can write a function that does the conversion, for example:
f <- function(s) {
l <- nchar(s)
x <- as.numeric(substr(s, 1, l-1))
u <- substr(s, l, l)
x * 10^(3 * match(u, c("k", "M", "G")))
}
f("2M")
f("200k")
Edit: or a little bit more generic:
f <- function(s) {
x <- as.numeric(gsub("[kMG]", "", s))
u <- gsub("[0-9.]", "", s)
if (nchar(u)) x <- x * 10^(3 * match(u, c("k", "M", "G")))
x
}
f("20")
f("2M")
f("200k")
I am new to the data.table package, and I would like to replicate in a data.table structure what I would normally do with data.frames, shown below.
Dta <- data.frame(Customer = c("Javier","Oscar","Ivan","Peter"),Type_of_Customer=LETTERS[c(1,1:3)])
Dtb <- data.frame(Customer = c("Javier","Oscar","Ivan","Jack"),Zone=5:8,District=100:103)
Result <- cbind(Dtb[match(Dta[,"Customer"], Dtb[,"Customer"]), c("Zone","District")], Dta)
ww <- which(is.na(Result[,"Zone"]))
if(length(ww) > 0){
Result[ww,"Zone"] <- "Not in Dtb"
}
ww <- which(is.na(Result[,"District"]))
if(length(ww) > 0){
Result[ww,"District"] <- "Not in Dtb"
}
So If I had Dta and Dtb as data.table structure, what would be the way to go?
(Note: in the real sample I have around 10 million rows, so I would need the most time-efficient solution.)
Dta <- data.table(Custumer = c("Javier","Oscar","Ivan","Peter"),Type_of_Customer=LETTERS[c(1,1:3)])
Dtb <- data.table(Custumer = c("Javier","Oscar","Ivan","Jack"),Zone=5:8,District=100:103)
Thanks.
We can use a join on the 'Custumer' column and replace the NA elements with the 'Not in Dtb' string:
Dtb[Dta, on = .(Custumer)][, c("Zone", "District") :=
.(as.character(Zone), as.character(District))
][is.na(Zone), c("Zone", "District") := "Not in Dtb"][]
# Custumer Zone District Type_of_Customer
#1: Javier 5 100 A
#2: Oscar 6 101 A
#3: Ivan 7 102 B
#4: Peter Not in Dtb Not in Dtb C
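With around 10 million rows, an update join that adds the columns to Dta by reference (avoiding an intermediate copy) may also be worth trying; a sketch, keeping the 'Custumer' spelling from the question:
Dta[Dtb, on = .(Custumer), `:=`(Zone = as.character(i.Zone),
                                District = as.character(i.District))]
Dta[is.na(Zone), c("Zone", "District") := "Not in Dtb"]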
Given a set of lines, I have to find the words with the maximum occurrence (it need not be a single word; it can be a set of words as well).
Say I have a text like
string <- "He is john beck. john beck is working as an chemical engineer. Most of the chemical engineers are john beck's friend"
I want output to be,
john beck - 3
chemical engineer - 2
Is there any function or package which does this?
Try this:
string <- "He is john beck. john beck is working as an chemical engineer. Most of the chemical engineers are john beck's friend"
library(tau)
library(tm)
tokens <- MC_tokenizer(string)
tokens <- tokens[tokens != ""]
string_ <- paste(stemCompletion(stemDocument(tokens), tokens), collapse = " ")
## if you want only bi-grams:
tab <- sort(textcnt(string_, method = "string", n = 2), decreasing = TRUE)
data.frame(Freq = tab[tab > 1])
# Freq
# john beck 3
# chemical engineer 2
## if you want uni-, bi- and tri-grams:
nmin <- 1; nmax <- 3
tab <- sort(do.call(c, lapply(nmin:nmax, function(x) textcnt(string_, method = "string", n = x) )), decreasing = TRUE)
data.frame(Freq = tab[tab > 1])
# Freq
# beck 3
# john 3
# john beck 3
# chemical 2
# engineer 2
# is 2
# chemical engineer 2
You could also try this, using the quanteda package:
require(quanteda)
mydfm <- dfm(string, ngrams = 1:2, concatenator = "_", stem = TRUE, verbose = FALSE)
topfeatures(mydfm)
## beck john john_beck chemic chemical_engin engin is
## 3 3 3 2 2 2 2
## an an_chem are
## 1 1 1
You lose the full word forms to stemming, but this counts "john beck" three times instead of just two (since without stemming, "john beck's" would be a separate type).
It's simpler though!
Below is the code I am trying to implement. I want to extract every 10 consecutive row values and turn them into corresponding columns.
This is how data looks like: https://drive.google.com/file/d/0B7huoyuu0wrfeUs4d2p0eGpZSFU/view?usp=sharing
I have been trying, but temp1 and temp2 come out empty. Please help.
library(Hmisc) #for increment function
myData <- read.csv("Clothing_&_Accessories.csv",header=FALSE,sep=",",fill=TRUE) # reading the csv file
extract<-myData$V2 # extracting the desired column
x<-1
y<-1
temp1 <- NULL #initialisation
temp2 <- NULL #initialisation
data.sorted <- NULL #initialisation
limit<-nrow(myData) # Calculating no of rows
while (x != limit) {
count <- 1
for (count in 11) {
if (count > 10) {
inc(x) <- 1
break # gets out of for loop
}
else {
temp1[y]<-data_mat[x] # extracting by every row element
}
inc(x) <- 1 # increment x
inc(y) <- 1 # increment y
}
temp2<-temp1
data.sorted<-rbind(data.sorted,temp2) # turn rows into columns
}
Your code is too complex. You can do this using only one for loop, without external packages, like this:
myData <- as.data.frame(matrix(c(rep("a", 10), "", rep("b", 10)), ncol=1), stringsAsFactors = FALSE)
newData <- data.frame(row.names=1:10)
for (i in 1:((nrow(myData)+1)/11)) {
start <- 11*i - 10
newData[[paste0("col", i)]] <- myData$V1[start:(start+9)]
}
You don't actually need all this though. You can simply remove the empty lines, split the vector in chunks of size 10 (as explained here) and then turn the list into a data frame.
vec <- myData$V1[nchar(myData$V1)>0]
as.data.frame(split(vec, ceiling(seq_along(vec)/10)))
# X1 X2
# 1 a b
# 2 a b
# 3 a b
# 4 a b
# 5 a b
# 6 a b
# 7 a b
# 8 a b
# 9 a b
# 10 a b
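Equivalently, because the cleaned vector's length is a multiple of 10, matrix() can do the chunking in one step (a sketch; the columns come out named V1, V2, ...):
as.data.frame(matrix(vec, nrow = 10), stringsAsFactors = FALSE)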
We could create a numeric index based on the '' values in the 'V2' column, split the dataset, use Reduce/merge to get the columns in the wide format.
indx <- cumsum(myData$V2=='')+1
res <- Reduce(function(...) merge(..., by= 'V1'), split(myData, indx))
res1 <- res[order(factor(res$V1, levels=myData[1:10, 1])),]
colnames(res1)[-1] <- paste0('Col', 1:3)
head(res1,3)
# V1 Col1 Col2 Col3
#2 ProductId B000179R3I B0000C3XXN B0000C3XX9
#4 product_title Amazon.com Amazon.com Amazon.com
#3 product_price unknown unknown unknown
Judging from p1.png, the 'V1' column may also be meant as the column names for the values in 'V2'. If that is the case, we can 'transpose' 'res1' except the first column and set the column names of the output from the first column of 'res1' (setNames(...)).
res2 <- setNames(as.data.frame(t(res1[-1]), stringsAsFactors=FALSE),
res1[,1])
row.names(res2) <- NULL
res2[] <- lapply(res2, type.convert)
head(res2)
# ProductId product_title product_price userid
#1 B000179R3I Amazon.com unknown A3Q0VJTU04EZ56
#2 B0000C3XXN Amazon.com unknown A34JM8F992M9N1
#3 B0000C3XX9 Amazon.com unknown A34JM8F993MN91
# profileName helpfulness reviewscore review_time
#1 Jeanmarie Kabala "JP Kabala" 7/7 4 1182816000
#2 M. Shapiro 6/6 5 1205107200
#3 J. Cruze 8/8 5 120571929
# review_summary
#1 Periwinkle Dartmouth Blazer
#2 great classic jacket
#3 Good jacket
# review_text
#1 I own the Austin Reed dartmouth blazer in every color
#2 This is the second time I bought this jacket
#3 This is the third time I bought this jacket
I guess this is just a reshaping issue. In that case, we can use dcast from data.table to convert from long to wide format
library(data.table)
DT <- dcast(setDT(myData)[V1 != ''][, N := paste0('Col', 1:.N), V1],
            V1 ~ N, value.var = 'V2')
data
myData <- structure(list(V1 = c("ProductId", "product_title",
"product_price",
"userid", "profileName", "helpfulness", "reviewscore", "review_time",
"review_summary", "review_text", "", "ProductId", "product_title",
"product_price", "userid", "profileName", "helpfulness",
"reviewscore",
"review_time", "review_summary", "review_text", "", "ProductId",
"product_title", "product_price", "userid", "profileName",
"helpfulness",
"reviewscore", "review_time", "review_summary", "review_text"
), V2 = c("B000179R3I", "Amazon.com", "unknown", "A3Q0VJTU04EZ56",
"Jeanmarie Kabala \"JP Kabala\"", "7/7", "4", "1182816000",
"Periwinkle Dartmouth Blazer",
"I own the Austin Reed dartmouth blazer in every color", "",
"B0000C3XXN", "Amazon.com", "unknown", "A34JM8F992M9N1",
"M. Shapiro",
"6/6", "5", "1205107200", "great classic jacket",
"This is the second time I bought this jacket",
"", "B0000C3XX9", "Amazon.com", "unknown", "A34JM8F993MN91",
"J. Cruze", "8/8", "5", "120571929", "Good jacket",
"This is the third time I bought this jacket"
)), .Names = c("V1", "V2"), row.names = c(NA, 32L),
class = "data.frame")