Multiple matching function in r - r

I am trying to match two datasets using the following variables School (unique) with classes that need teachers. Some teachers have one specialty, some have more than one. I have been trying to use the match() and which( %in% ) base functions but I cannot get it to search for all the possible teacher matches. It always stops after the first match. Here is some sample data:
class<-c("english","history","art","art","math","history","art")
school<-c("C.H.S.","B.H.S.","D.H.S.","A.H.S.","Z.H.S.","M.H.S.","L.H.S.")
specialty<-c("math","history","English","history","literature","art","English")
teacher<-c("Jill","Jill","Sam","Liz","Liz","Liz","Rob")
teacher.skills<-data.frame(teacher, specialty)
school.needs<-data.frame(school,class)
teacher.match<-data.frame(Jill,Sam,Rob,Liz)
The final result would look like this:
Jill<-c("No","Yes","No","No","Yes","Yes","No")
Sam<-c("Yes","No","No","No","No","No","No")
Liz<-c("No","Yes","Yes","Yes","No","Yes","Yes")
Rob<-c("Yes","No","No","No","No","No","No")
match.result<-data.frame(school.needs, teacher.match)
match.result
I have even tried working on a little function like this but still can't get the final formatting right.
source.1<-school.needs
source.2<-teacher.skills
dist.name<-adist(source.1$class, source.2$specialty, partial = FALSE, ignore.case = TRUE)
min.name<-apply(dist.name, 1, min)
school.teacher.match<-NULL
for(i in 1:nrow(dist.name))
{
skills.ref<-match(min.name[i], dist.name[i,])
school.ref<-i
school.teacher.match<-rbind(data.frame(skills.ref=skills.ref, school.ref=school.ref, Teacher=source.2[skills.ref,]$teacher, Class=source.1[school.ref,]$class, School=source.1[school.ref,]$school, adist=min.name[i]), school.teacher.match)
school.teacher.match<-subset(school.teacher.match, school.teacher.match$adist==0)
}
school.teacher.match
Any help would be much appreciated, thanks!

Note that I had to modify your input data to change "English" to "english" for each match. The data is given by:
school.needs <- structure(list(school = structure(c(3L, 2L, 4L, 1L, 7L, 6L, 5L
), .Label = c("A.H.S.", "B.H.S.", "C.H.S.", "D.H.S.", "L.H.S.",
"M.H.S.", "Z.H.S."), class = "factor"), class = structure(c(2L,
3L, 1L, 1L, 4L, 3L, 1L), .Label = c("art", "english", "history",
"math"), class = "factor")), .Names = c("school", "class"), row.names = c(NA,
-7L), class = "data.frame")
teacher.skills <- structure(list(teacher = structure(c(1L, 1L, 4L, 2L, 2L, 2L,
3L), .Label = c("Jill", "Liz", "Rob", "Sam"), class = "factor"),
specialty = structure(c(5L, 3L, 2L, 3L, 4L, 1L, 2L), .Label = c("art",
"english", "history", "literature", "math"), class = "factor")), .Names = c("teacher",
"specialty"), row.names = c(NA, -7L), class = "data.frame")
Using merge and dcast from reshape2 (or data.table):
library(reshape2)
## use merge to match needs to skills
m <- merge(school.needs,teacher.skills,by.x="class",by.y="specialty")
m$val <- "Yes" ## add a column for the "Yes"
## go to wide format for the final result filling NA with "No"
result <- dcast(m,school+class~teacher,value.var="val",fill="No")
## school class Jill Liz Rob Sam
##1 A.H.S. art No Yes No No
##2 B.H.S. history Yes Yes No No
##3 C.H.S. english No No Yes Yes
##4 D.H.S. art No Yes No No
##5 L.H.S. art No Yes No No
##6 M.H.S. history Yes Yes No No
##7 Z.H.S. math Yes No No No

Here's how I'd do it:
(data)
schools <- data.frame(
school = c("C.H.S.", "B.H.S.", "D.H.S.", "A.H.S.","Z.H.S.", "M.H.S.", "L.H.S."),
class = c("english", "history", "art", "art", "math", "history", "art"),
stringsAsFactors = F)
teachers <- data.frame(
teacher = c("Jill", "Jill", "Sam", "Liz", "Liz", "Liz", "Rob"),
specialty = c("math", "history", "English", "history", "literature", "art", "English"),
stringsAsFactors = F)
(key concepts)
# you can get the specialties of a given teacher like this:
subset(teachers, teacher == 'Jill')$specialty
# [1] "math" "history"
# you can get the set of unique teachers like this:
unique(teachers$teacher)
# [1] "Jill" "Sam" "Liz" "Rob"
(solution)
# for each teacher, do any of their specialties match the class need of each school?
matches <-
sapply(unique(teachers$teacher), function(this_t) {
specs <- subset(teachers, teacher == this_t)$specialty
schools$class %in% specs
})
# combine with school data.frame
data.frame(schools, matches)
# school class Jill Sam Liz Rob
# 1 C.H.S. english FALSE FALSE FALSE FALSE
# 2 B.H.S. history TRUE FALSE TRUE FALSE
# 3 D.H.S. art FALSE FALSE TRUE FALSE
# 4 A.H.S. art FALSE FALSE TRUE FALSE
# 5 Z.H.S. math TRUE FALSE FALSE FALSE
# 6 M.H.S. history TRUE FALSE TRUE FALSE
# 7 L.H.S. art FALSE FALSE TRUE FALSE
Some notes:
1) It's way easier to read (and think about) when you include appropriate spacing in your code. Also, rather than create a bunch of vectors and then assemble into data.frames, do this in one step -- it's shorter, it helps show how the vectors relate to each other, and it won't clutter your global environment.
2) I'm leaving the match values as FALSE/TRUE, because since this is boolean data, it makes sense to use the appropriate data type. If you really want No/Yes, though, you can change these values into factors with those labels
3) The results are a little bit different than what you expected because 'English' == 'english' is FALSE. You might want to clean up your starting data. If you know that cases will be mixed and you case-insensitive matching, you can coerce all values to lowercase before comparing: tolower(schools$class) %in% tolower(specs)

Related

Is there a way in R to convert the following character variable?

I have the following dataframe with a character variable that represents the number of lanes on a highway, can I replace this vector with a similar vector that has numbers instead of letter?
df<- structure(list(Blocked.Lanes = c("|RS|RS|ML|", "|RS|", "|RS|ML|ML|ML|ML|",
"|RS|", "|RS|RE|", "|ML|ML|ML|", "|RS|ML|", "|RS|", "|ML|ML|ML|ML|ML|ML|",
"|RS|ML|ML|"), Event.Id = c(240314L, 240381L, 240396L, 240796L,
240948L, 241089L, 241190L, 241225L, 241226L, 241241L)), row.names = c(NA,
10L), class = "data.frame")
The output should be something like df2 below:
df2<- structure(list(Blocked.Lanes = c(3L, 1L, 5L, 1L, 2L, 3L, 2L,
1L, 6L, 3L), Event.Id = c(240314L, 240381L, 240396L, 240796L,
240948L, 241089L, 241190L, 241225L, 241226L, 241241L)), class = "data.frame", row.names = c(NA,
-10L))
One way would be to count number of "|" in each string. We subtract it with - 1 since there is an additional "|".
stringr::str_count(df$Blocked.Lanes, '\\|') - 1
#[1] 3 1 5 1 2 3 2 1 6 3
In base R :
lengths(gregexpr("\\|", df$Blocked.Lanes)) - 1
Another way would to be count exact words in the string.
stringr::str_count(df$Blocked.Lanes, '\\w+')
lengths(gregexpr("\\w+", df$Blocked.Lanes))
Similar to Ronak's solution you could also do:
stringr:str_count(df$Blocked.Lanes, "\\b[A-Z]{2}\\b")
if the lanes are always 2 letters long, or
stringr:str_count(df$Blocked.Lanes, "\\b[A-Z]+\\b")
if the lanes are always at least one letter long.
stringr:str_count(df$Blocked.Lanes, "(?<=\\|)[A-Z]+(?=\\|)")
also works.
Not as succinct as #Ronak Shah's, but another method in Base R.
String split on string literal "|" and then count elements:
df2 <- transform(df, Blocked.Lanes = lengths(Map(function(x) x[x != ""],
strsplit(df$Blocked.Lanes, "|", fixed = TRUE))))

how can I group based on similarity in strings

I have a data like this
df <-structure(list(label = structure(c(5L, 6L, 7L, 8L, 3L, 1L, 2L,
9L, 10L, 4L), .Label = c(" holand", " holandindia", " Holandnorway",
" USAargentinabrazil", "Afghanestan ", "Afghanestankabol", "Afghanestankabolindia",
"indiaAfghanestan ", "USA", "USAargentina "), class = "factor"),
value = structure(c(5L, 4L, 1L, 9L, 7L, 10L, 6L, 3L, 2L,
8L), .Label = c("1941029507", "2367321518", "2849255881",
"2913128511", "2927576083", "4550996370", "457707181.9",
"637943892.6", "796495286.2", "89291651.19"), class = "factor")), .Names = c("label",
"value"), class = "data.frame", row.names = c(NA, -10L))
I want to get the largest name (in letter) and then see how many smaller and similar names are and assign them to a group
then go for another next large name and assign them to another group
until no group left
at first I calculate the length of each so I will have the length of them
library(dplyr)
dft <- data.frame(names=df$label,chr=apply(df,2,nchar)[,1])
colnames(dft)[1] <- "label"
df2 <- inner_join(df, dft)
Now I can simply find which string is the longest
df2[which.max(df2$chr),]
Now I should see which other strings have the letters similar to this long string . we have these possibilities
Afghanestankabolindia
it can be
A
Af
Afg
Afgh
Afgha
Afghan
Afghane
.
.
.
all possible combinations but the order of letter should be the same (from left to right) for example it should be Afghand cannot be fAhg
so we have only two other strings that are similar to this one
Afghanestan
Afghanestankabol
it is because they should be exactly similar and not even a letter different (more than the largest string) to be assigned to the same group
The desire output for this is as follows:
label value group
Afghanestan 2927576083 1
Afghanestankabol 2913128511 1
Afghanestankabolindia 1941029507 1
indiaAfghanestan 796495286.2 2
Holandnorway 457707181.9 3
holand 89291651.19 3
holandindia 4550996370 3
USA 2849255881 4
USAargentina 2367321518 4
USAargentinabrazil 637943892.6 4
why indiaAfghanestan is a seperate group? because it does not completely belong to another name (it has partially name from one or another). it should be part of a bigger name
I tried to use this one Find similar strings and reconcile them within one dataframe which did not help me at all
I found something else which maybe helps
require("Biostrings")
pairwiseAlignment(df2$label[3], df2$label[1], gapOpening=0, gapExtension=4,type="overlap")
but still I don't know how to assign them into one group
You could try
library(magrittr)
df$label %>%
tolower %>%
trimws %>%
stringdist::stringdistmatrix(method = "jw", p = 0.1) %>%
as.dist %>%
`attr<-`("Labels", df$label) %>%
hclust %T>%
plot %T>%
rect.hclust(h = 0.3) %>%
cutree(h = 0.3) %>%
print -> df$group
df
# label value group
# 1 Afghanestan 2927576083 1
# 2 Afghanestankabol 2913128511 1
# 3 Afghanestankabolindia 1941029507 1
# 4 indiaAfghanestan 796495286.2 2
# 5 Holandnorway 457707181.9 3
# 6 holand 89291651.19 3
# 7 holandindia 4550996370 3
# 8 USA 2849255881 4
# 9 USAargentina 2367321518 4
# 10 USAargentinabrazil 637943892.6 4
See ?stringdist::'stringdist-metrics' for an overview of the string dissimilarity measures offered by stringdist.

Behavior of function "identical" with factors

R stores factors as integers. Therefore, when using the function identical, it cannot find when two factors are of the same name if they have different levels.
Here's an MWE:
y <- structure(list(portfolio_date = structure(c(1L, 1L, 1L, 2L, 2L,
2L), .Label = c("2000-10-31", "2001-04-30"), class = "factor"),
security = structure(c(2L, 2L, 1L, 3L, 2L, 4L), .Label = c("Currency Australia (Fwd)",
"Currency Euro (Fwd)", "Currency Japan (Fwd)", "Currency United Kingdom (Fwd)"
), class = "factor")), .Names = c("portfolio_date", "security"
), row.names = c(10414L, 10417L, 10424L, 21770L, 21771L, 21774L
), class = "data.frame")
x <- structure(list(portfolio_date = structure(1L, .Label = "2000-10-31", class = "factor"),
security = structure(1L, .Label = "Currency Euro (Fwd)", class = "factor")),
.Names = c("portfolio_date", "security"), row.names = 10414L, class = "data.frame")
identical(y[1,], x)
Returns FALSE
But if we look at the objects, they appear identical to the user
y[1,]
portfolio_date security
10414 2000-10-31 Currency Euro (Fwd)
x
portfolio_date security
10414 2000-10-31 Currency Euro (Fwd)
Ultimately I want to be able to do something like the following:
apply(y, 1, identical, x)
10414 10417 10424 21770 21771 21774
TRUE TRUE FALSE FALSE FALSE FALSE
which(apply(y, 1, identical, x))
1 2
Any suggestions as to how to achieve this? Thanks.
One option is to use the rowwise from dplyr to check row-by-row; If you need to compare the row.names at the same time then you need to create an id column for both, otherwise, it will return TRUE for the first two rows.
library(dplyr)
x$id <- row.names(x)
y$id <- row.names(y)
rowwise(y) %>% do(check = isTRUE(all.equal(., x, check.attributes = F))) %>% data.frame
check
1 TRUE
2 FALSE
3 FALSE
4 FALSE
5 FALSE
6 FALSE
In order to perform the comparison, the factors need to be converted into character objects.
By using base R alone here is a solution:
apply(apply(y, 2, as.character), 1, identical, apply(x, 2, as.character))
The inner apply loops convert each column in the source and target data frames to character objects and the outer apply loops through the rows.
If the x data frame has more than one row, the actual behavior may not be as expected.
Use the package 'compare'.
library(compare)
result <- NULL
for (i in 1:NROW(y)){
one <- compare(y[i,], x, dropLevels=T)
two <- one$detailedResult[1]==T & one$detailedResult[2]==T
result <- c(result, two)
}
as.character(result)#TRUE TRUE FALSE FALSE FALSE FALSE
Solution for data posted in OP
The example posted in the OP can be easily treated by using droplevels().
Let us first look at why the comparison identical(y[1,], x) returns FALSE:
str(y[1,])
#'data.frame': 1 obs. of 2 variables:
#$ portfolio_date: Factor w/ 2 levels "2000-10-31","2001-04-30": 1
#$ security : Factor w/ 4 levels "Currency Australia (Fwd)",..: 2
whereas
str(x)
#'data.frame': 1 obs. of 2 variables:
#$ portfolio_date: Factor w/ 1 level "2000-10-31": 1
#$ security : Factor w/ 1 level "Currency Euro (Fwd)": 1
So the difference lies in the factors, even though both objects are displayed in the same way, as shown in the OP's question.
This is where the function droplevels() is useful: it removes unused factors. By applying droplevels() to y[1,] with its redundant factors, we obtain:
identical(droplevels(y[1,]), x)
#[1] TRUE
If x also contains unused factors, it will be necessary to wrap it into droplevels(), too. In any case, it won't do any harm:
identical(droplevels(y[1,]), droplevels(x))
#[1] TRUE
General solution
Using droplevels() may not work if the real data is more complex than the data posted in the "MWE" of the OP. Such situations may include, e.g., equivalent entries in x and y[1,] that are stored as different factor levels. An example where droplevels() fails is given in the data section at the end of this answer.
The following solution represents an efficient possibility to treat such general situations. It works for the data posted in the OP as well as for the more complicated case of the data posted below.
First, two auxiliary vectors are created that contain only the characters of each row. By using paste() we can concatenate each row to a single character string:
temp_x <- apply(x, 1, paste, collapse=",")
temp_y <- apply(y, 1, paste, collapse=",")
With these vectors it becomes easily possible to compare rows of the original data.frames, even if the entries were originally stored as factors with different levels and numbering.
To identify which rows are identical, we can use the %in% operator, which is more appropriate than the function identical() in this case, as the former checks for equality of all possible row combinations, and not just individual pairs.
With these simple modifications the desired output can be obtained quickly and without further loops:
setNames(temp_y %in% temp_x, names(temp_y))
#10414 10417 10424 21770 21771 21774
# TRUE TRUE FALSE FALSE FALSE FALSE
which(temp_y %in% temp_x)
#[1] 1 2
y[temp_y %in% temp_x,]
# portfolio_date security
#10414 2000-10-31 Currency Euro (Fwd)
#10417 2000-10-31 Currency Euro (Fwd)
data
x <- structure(list(portfolio_date = structure(1:2, .Label = c("2000-05-15",
"2000-10-31"), class = "factor"), security = structure(c(2L, 1L),
.Label = c("Currency Euro (Fwd)", "Currency USD (Fwd)"),
class = "factor")), .Names = c("portfolio_date", "security"),
class = "data.frame", row.names = c("10234", "10414"))
y <- structure(list(portfolio_date = structure(c(1L, 1L, 1L, 2L, 2L, 2L),
.Label = c("2000-10-31", "2001-04-30"), class = "factor"),
security = structure(c(2L, 2L, 1L, 3L, 2L, 4L),
.Label = c("Currency Australia (Fwd)", "Currency Euro (Fwd)",
"Currency Japan (Fwd)", "Currency United Kingdom (Fwd)"),
class = "factor")), .Names = c("portfolio_date", "security"),
row.names = c(10414L, 10417L, 10424L, 21770L, 21771L, 21774L),
class = "data.frame")

R: Finding out the corresponding value for a category in a data frame

My original data has about 1000 observations and has the following variables.
$Nationality : Factor "American" "Korean" ...
$Food : Factor "Milk" "Fruits" "Rice"
$No. of servings : num 5 6 3
I wanted to construct a table, which shows for $Nationality == American, what is the $Food that they eat, and its corresponding $No. of servings.
Since my original data is huge, i tried to first subset the data using:
American = subset(originaldata, $Nationality == "American"), to create a data frame which contain records of American nationality only.
Then i applied the table ( ) function on the subsetted data (i.e. American) using: table(American$Food, American$No. of servings)
The results, instead of just containing $Nationality == "American" records, had also contained all other Nationality records.
Why is this so? Is there any method to work around with this problem? I want a table which only contains records of Nationality == American, showing data on $Food and $No. of servings in two columns.
You can split your data by nationality and then extract 'American',
list1 <- split(originaldata, originaldata$Nationality)
list1$American
# Nationality Food No.ofServings
#1 American Fruits 3
#2 American rise 5
#5 American pasta 9
DATA
dput(originaldata)
structure(list(Nationality = structure(c(1L, 1L, 3L, 2L, 1L), .Label = c("American",
"British", "Korean"), class = "factor"), Food = structure(c(1L,
4L, 2L, 1L, 3L), .Label = c("Fruits", "meat", "pasta", "rise"
), class = "factor"), No.ofServings = c(3, 5, 6, 2, 9)), .Names = c("Nationality",
"Food", "No.ofServings"), row.names = c(NA, -5L), class = "data.frame")
or with the dplyr package:
install.packages("dplyr")
library(dplyr)
AmericanData = filter(yourdata, Nationality == "American")
For Large scale data use data.table .
if I understand your problem correctly then it should be achievable by following
library(data.table)
dt= as.data.table(your_data)
dt[,.SD,Nationality]
with the data that #sotos provided it would look like
dt <- as.data.table(x)
> dt[,.SD,Nationality]
Nationality Food No.ofServings
1: American Fruits 3
2: American rise 5
3: American pasta 9
4: Korean meat 6
5: British Fruits 2
filtering is easy peasy
> dt[Nationality=="American"]
Nationality Food No.ofServings
1: American Fruits 3
2: American rise 5
3: American pasta 9

split dataset by day and save it as data frame

I have a dataset with 2 months of data (month of Feb and March). Can I know how can I split the data into 59 subsets of data by day and save it as data frame (28 days for Feb and 31 days for Mar)? Preferably to save the data frame in different name according to the date, i.e. 20140201, 20140202 and so forth.
df <- structure(list(text = structure(c(4L, 6L, 5L, 2L, 8L, 1L), .Label = c(" Terpilih Jadi Maskapai dengan Pelayanan Kabin Pesawat cont",
"booking number ZEPLTQ I want to cancel their flight because they can not together with my wife and kids",
"Can I change for the traveler details because i choose wrongly for the Mr or Ms part",
"cant do it with cards either", "Coming back home AK", "gotta try PNNL",
"Jadwal penerbangan medanjktsblm tangalmasi ada kah", "Me and my Tart would love to flyLoveisintheAir",
"my flight to Bangkok onhas been rescheduled I couldnt perform seat selection now",
"Pls checks his case as money is not credited to my bank acctThanks\n\nCASLTP",
"Processing fee Whatt", "Tacloban bound aboardto get them boats Boats boats boats Tacloban HeartWork",
"thanks I chatted with ask twice last week and told the same thing"
), class = "factor"), created = structure(c(1L, 1L, 2L, 2L, 3L,
3L), .Label = c("1/2/2014", "2/2/2014", "5/2/2014", "6/2/2014"
), class = "factor")), .Names = c("text", "created"), row.names = c(NA,
6L), class = "data.frame")
You don't need to output multiple dataframes. You only need to select/subset them by year&month of the 'created' field. So here are two ways do do that: 1. is simpler if you don't plan on needing any more date-arithmetic
# 1. Leave 'created' a string, just use text substitution to extract its month&date components
df$created_mthyr <- gsub( '([0-9]+/)[0-9]+/([0-9]+)', '\\1\\2', df$created )
# 2. If you need to do arbitrary Date arithmetic, convert 'created' field to Date object
# in this case you need an explicit format-string
df$created <- as.Date(df$created, '%M/%d/%Y')
# Now you can do either a) split
split(df, df$created_mthyr)
# specifically if you want to assign the output it creates to 3 dataframes:
df1 <- split(df, df$created_mthyr)[[1]]
df2 <- split(df, df$created_mthyr)[[2]]
df5 <- split(df, df$created_mthyr)[[3]]
# ...or else b) do a Split-Apply-Combine and perform arbitrary command on each separate subset. This is very powerful. See plyr/ddply documentation for examples.
require(plyr)
df1 <- dlply(df, .(created_mthyr))[[1]]
df2 <- dlply(df, .(created_mthyr))[[2]]
df5 <- dlply(df, .(created_mthyr))[[3]]
# output looks like this - strictly you might not want to keep 'created','created_mthyr':
> df1
# text created created_mthyr
#1 cant do it with cards either 1/2/2014 1/2014
#2 gotta try PNNL 1/2/2014 1/2014
> df2
#3
#Coming back home AK
#4 booking number ZEPLTQ I want to cancel their flight because they can not together with my wife and kids
# created created_mthyr
#3 2/2/2014 2/2014
#4 2/2/2014 2/2014

Resources