select a row based on another row in a data frame - r

My data looks like this
df <- structure(list(V1 = 1:15, V2 = structure(c(5L, 9L, 7L, 8L, 10L,
2L, 13L, 3L, 11L, 12L, 15L, 1L, 4L, 14L, 6L), .Label = c("A0A087WNY6",
"B2RTL5", "B8JJX9", "D3Z2H7", "E9PZ97", "G3UWX1", "Q2VWQ4", "Q3TMB5",
"Q3TWK2", "Q6ZPS9", "Q7TMW3", "Q8BP71", "Q8R4K2", "Q925B0", "Q9WU01"
), class = "factor"), V3 = c(5L, 7L, 10L, 11L, 13L, 15L, NA,
NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("V1", "V2", "V3"
), class = "data.frame", row.names = c(NA, -15L))
I want to select the rows from the first two columns based on third column values
the expected output is this
5 Q6ZPS9
7 Q8R4K2
10 Q8BP71
11 Q9WU01
13 D3Z2H7
15 G3UWX1

I feel like V3 should not be in this dataframe but a different vector. But here is a way
df[df$V1 %in% df$V3,1:2]

Related

Count letter frequencies in descending order

I have data that looks like this
df<-structure(list(col = structure(c(9L, 2L, 13L, 11L, 5L, 7L, 10L,
6L, 8L, 3L, 12L, 4L, 1L), .Label = c("HHRGGVCTS", "MGSSN", "MVKTTYYDVG",
"RRHYNGAYDD", "RTSTN", "S", "SNCWC", "sp|P31689|DNJA1_HUMAN DnaJ homolog GN=DNAJA1 PE=1 SV=2 ",
"sp|Q9H9K5|MER34_HUMAN Endogenous PE=1 SV=1", "THYDT", "TVHAV",
"VCMCVVDDNR", "YATTA"), class = "factor")), class = "data.frame", row.names = c(NA,
-13L))
I am trying to count letter frequencies. There are 20 possible letters which I want to count in each row.
For example,
the first row: row starts with sp| so character frequencies are not calculated and result is the original string
the second row: doesn't start with sp| so it will show character frequencies
MGSSN 2,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
which means there are 2 S, 1 M, 1 G and 1 N, and the counts for all the other letters are 0.
The character frequencies are ordered in descending order.
The final output would look like the following
output<-structure(list(col = structure(c(9L, 2L, 13L, 11L, 5L, 7L, 10L,
6L, 8L, 3L, 12L, 4L, 1L), .Label = c("HHRGGVCTS", "MGSSN", "MVKTTYYDVG",
"RRHYNGAYDD", "RTSTN", "S", "SNCWC", "sp|P31689|DNJA1_HUMAN DnaJ homolog GN=DNAJA1 PE=1 SV=2 ",
"sp|Q9H9K5|MER34_HUMAN Endogenous PE=1 SV=1", "THYDT", "TVHAV",
"VCMCVVDDNR", "YATTA"), class = "factor"), Col2 = structure(c(8L,
2L, 3L, 2L, 2L, 2L, 2L, 1L, 7L, 5L, 6L, 5L, 4L), .Label = c("1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0",
"2,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", "2,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0",
"2,2,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0", "2,2,2,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0",
"3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0", "sp|P31689|DNJA1_HUMAN DnaJ homolog GN=DNAJA1 PE=1 SV=2 ",
"sp|Q9H9K5|MER34_HUMAN Endogenous PE=1 SV=1"), class = "factor")), class = "data.frame", row.names = c(NA,
-13L))
We can use str_count
library(stringr)
# The question asks for 20 letters per row, whereas LETTERS has 26.
# Assumes the 20 letters are the standard amino-acid one-letter codes (the
# rows look like protein sequences) -- TODO confirm the intended alphabet.
aa_letters <- strsplit("ACDEFGHIKLMNPQRSTVWY", "")[[1]]
# TRUE for the rows that get letter counts (those not starting with "sp").
i1 <- !grepl("^sp", df$col)
# Convert the factor once: assigning a factor into a character vector would
# store its integer level codes instead of the original strings.
col_chr <- as.character(df$col)
# Start from the original strings so the "sp" rows keep their text as-is.
df$col2 <- col_chr
# Per-letter counts, sorted high to low and joined with "," (no space) to
# match the expected output format.
df$col2[i1] <- vapply(
  col_chr[i1],
  function(x) {
    paste(sort(str_count(x, aa_letters), decreasing = TRUE), collapse = ",")
  },
  character(1),
  USE.NAMES = FALSE
)
Or instead of keeping as a string, it can be a list column as well
library(tidyverse)
# The question asks for 20 letters per row, whereas LETTERS has 26.
# Assumes the 20 letters are the standard amino-acid one-letter codes (the
# rows look like protein sequences) -- TODO confirm the intended alphabet.
aa_letters <- strsplit("ACDEFGHIKLMNPQRSTVWY", "")[[1]]
df %>%
  mutate(
    col = as.character(col),
    # List column: "sp" rows keep their string, every other row holds its
    # per-letter counts sorted from highest to lowest.
    col2 = map(col, ~ if (str_detect(.x, "^sp")) {
      .x
    } else {
      sort(str_count(.x, aa_letters), decreasing = TRUE)
    })
  )

'mydata$column <- NULL' not working in function

This is the function:
# Intended to drop every column of `mydata` whose name ends in ".y".
# NOTE: as written it leaves the caller's `mydata` unchanged; see the comments
# on the individual lines below.
remove_column <- function(column_vector) {
for (column in column_vector) {
# In a regex an unescaped '.' matches any character, so this tests for
# "any character followed by y" at the end of the name, not a literal ".y".
if (grepl('.y$', column)) {
# `$` takes the name literally: this targets a column called "column", not
# the name held in the loop variable. The assignment also only modifies a
# copy of `mydata` local to the function, and that copy is never returned.
mydata$column <- NULL
}
}
}
What I think it's doing: I'm passing a vector of my column names to the function, it's looping through the list of names and asking whether the last characters of each column name are ".y". If that is the case, the function eliminates the column.
I've tried putting prints here and there to see my vector and to see whether the conditional evaluates to TRUE or FALSE, and everything seems to be working fine, but for some reason, it doesn't get rid of the column.
The following function returns my column vector:
#' Find column names that carry a dot-separated suffix
#'
#' A name qualifies when splitting it on a literal "." yields more than one
#' piece, e.g. "colname.x" splits into "colname" and "x".
#'
#' @param col_names Vector of column names (coerced to character).
#' @return Character vector of the qualifying names, in their original order;
#'   `character(0)` when none qualify or the input is empty.
duplicate_names <- function(col_names) {
  # Coercion lets NULL and factor input through and drops any element names.
  col_names <- as.character(col_names)
  # strsplit() is vectorised, so one call replaces the element-by-element loop
  # that grew the result with c() on every match.
  has_suffix <- lengths(strsplit(col_names, "\\.")) > 1
  col_names[has_suffix]
}
I usually call it like this:
duplicate_names(names(mydata))
This is what the vector of columns looks like:
c('v1.x', 'v2.y')
When I print the function it returns the following:
[1] "v1.x" "v2.y"
As requested by a user, the dput(droplevels(horsedata[1:5, 1:5])) (data that I am using for this):
dput(droplevels(horsedata[1:5, 1:5]))
structure(list(ÿþhorse_name = structure(c(3L, 1L, 2L, 4L, 5L), .Label = c("IM PRETTY FAMES",
"JESS ROYAL BUCKS", "KISS ME IM SUGAR", "LOLAMO", "RUN MADISON RUN"
), class = "factor"), owner_name = structure(c(3L, 2L, 1L, 5L,
4L), .Label = c("Christine Tavares", "Heste Sport, Inc.", "Picov Cattle Co.",
"Procter, Wayne and Carol", "Ruth F. Barbour"), class = "factor"),
program = structure(1:5, .Label = c("1", "2", "3", "4", "5"
), class = "factor"), pp = 1:5, todays_cls = c(61L, 61L,
61L, 61L, 61L)), .Names = c("ÿþhorse_name", "owner_name",
"program", "pp", "todays_cls"), row.names = c(NA, 5L), class = "data.frame")
We don't need a loop to subset the columns.
# Build the mask against names(mydata) so it has exactly one entry per column;
# a mask derived from column_list alone is shorter than ncol(mydata) and only
# works by being recycled. Columns not listed in column_list are kept.
mydata[!(names(mydata) %in% grep('\\.y$', column_list, value = TRUE))]
If there are other columns not in the column_list and we want to keep them (assuming that the 'column_list' is ordered)
mydata[setdiff(1:ncol(mydata), grep('\\.y$', column_list))]
We can modify your function by
changing .y$ to \\.y$ as . means any character and not just the dot
Instead of $, we use [ to subset the dataset
Return the dataset after the assignment
# Remove from `dat` every column named in `column_vec` whose name ends in a
# literal ".y", and return the resulting data frame.
#   dat        - data frame to trim
#   column_vec - candidate column names to examine
remove_column <- function(dat, column_vec) {
  # Select all names with the ".y" suffix in a single vectorised match.
  to_drop <- column_vec[grepl('\\.y$', column_vec, perl=TRUE)]
  # Delete the selected columns one by one; `[` uses the name held in `nm`.
  for (nm in to_drop) {
    dat[nm] <- NULL
  }
  dat
}
remove_column(mydata, column_list)
# v1.x v2.x v3
#1 6 1 9
#2 4 11 7
#3 14 15 5
#4 10 2 4
#5 13 4 0
#6 19 14 1
#7 5 1 8
#8 16 12 7
#9 16 13 5
#10 5 0 7
data
mydata <- structure(list(v1.x = c(6L, 4L, 14L, 10L, 13L, 19L, 5L, 16L,
16L, 5L), v1.y = c(12L, 7L, 14L, 14L, 6L, 18L, 4L, 0L, 10L, 2L
), v2.x = c(1L, 11L, 15L, 2L, 4L, 14L, 1L, 12L, 13L, 0L), v2.y = c(6L,
5L, 7L, 3L, 19L, 4L, 15L, 13L, 14L, 20L), v3 = c(9L, 7L, 5L,
4L, 0L, 1L, 8L, 7L, 5L, 7L)), .Names = c("v1.x", "v1.y", "v2.x",
"v2.y", "v3"), row.names = c(NA, -10L), class = "data.frame")
column_list <- c('v1.x', 'v1.y', 'v2.x', 'v2.y')

Converting object of class rules to data frame in R

I have an output of apriori function, which mines data and gives set of rules. I want to convert it to data frame for further processing.
The rules object looks like this:
> inspect(output)
lhs rhs support confidence lift
1 {curtosis=(846,1.27e+03]} => {skewness=(-0.254,419]} 0.2611233 0.8044944 2.418776
2 {variance=(892,1.34e+03]} => {notes.class=FALSE} 0.3231218 0.9888393 1.781470
3 {variance=(-0.336,446]} => {notes.class=TRUE} 0.2859227 0.8634361 1.940608
4 {skewness=(837,1.26e+03]} => {notes.class=FALSE} 0.2924872 0.8774617 1.580815
5 {entropy=(-0.155,386],
class=FALSE} => {skewness=(837,1.26e+03]} 0.1597374 0.9521739 2.856522
6 {variance=(-0.336,446],
curtosis=(846,1.27e+03]} => {skewness=(-0.254,419]} 0.1378556 0.8325991 2.503275
We can create rules object using data frame. Data frame looks like this:
> data
variance skewness curtosis entropy notes.class
1 (892,1.34e+03] (837,1.26e+03] (-0.268,424] (386,771] FALSE
2 (892,1.34e+03] (-0.254,419] (424,846] (771,1.16e+03] FALSE
3 (892,1.34e+03] (837,1.26e+03] (-0.268,424] (-0.155,386] FALSE
4 (446,892] (-0.254,419] (846,1.27e+03] (386,771] FALSE
Then we can get the output variable using this:
> output <- apriori(data)
The arules package was used. dput(output) gives this:
new("rules"
, lhs = new("itemMatrix"
, data = new("ngCMatrix"
, i = c(8L, 2L, 0L, 5L, 9L, 12L, 0L, 8L, 0L, 3L, 0L, 8L, 8L, 13L, 8L,
10L, 3L, 10L, 8L, 11L, 8L, 13L, 3L, 12L, 2L, 5L, 2L, 6L, 2L,
5L, 2L, 6L, 2L, 10L, 2L, 7L, 2L, 11L, 0L, 3L, 0L, 10L, 0L, 7L,
11L, 13L, 5L, 6L, 6L, 12L, 5L, 10L, 1L, 5L, 4L, 6L, 6L, 13L,
0L, 3L, 8L, 0L, 8L, 13L, 3L, 8L, 13L, 0L, 3L, 13L, 2L, 5L, 6L,
2L, 5L, 12L, 2L, 6L, 12L)
, p = c(0L, 1L, 2L, 3L, 4L, 6L, 8L, 10L, 12L, 14L, 16L, 18L, 20L, 22L,
24L, 26L, 28L, 30L, 32L, 34L, 36L, 38L, 40L, 42L, 44L, 46L, 48L,
50L, 52L, 54L, 56L, 58L, 61L, 64L, 67L, 70L, 73L, 76L, 79L)
, Dim = c(14L, 38L)
, Dimnames = list(NULL, NULL)
, factors = list()
)
, itemInfo = structure(list(labels = structure(c("variance=(-0.336,446]",
"variance=(446,892]", "variance=(892,1.34e+03]", "skewness=(-0.254,419]",
"skewness=(419,837]", "skewness=(837,1.26e+03]", "curtosis=(-0.268,424]",
"curtosis=(424,846]", "curtosis=(846,1.27e+03]", "entropy=(-0.155,386]",
"entropy=(386,771]", "entropy=(771,1.16e+03]", "notes.class=FALSE",
"notes.class=TRUE"), class = "AsIs"), variables = structure(c(5L,
5L, 5L, 4L, 4L, 4L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L), .Label = c("curtosis",
"entropy", "notes.class", "skewness", "variance"), class = "factor"),
levels = structure(c(4L, 8L, 12L, 2L, 6L, 10L, 3L, 7L, 11L,
1L, 5L, 9L, 13L, 14L), .Label = c("(-0.155,386]", "(-0.254,419]",
"(-0.268,424]", "(-0.336,446]", "(386,771]", "(419,837]",
"(424,846]", "(446,892]", "(771,1.16e+03]", "(837,1.26e+03]",
"(846,1.27e+03]", "(892,1.34e+03]", "FALSE", "TRUE"), class = "factor")), .Names = c("labels",
"variables", "levels"), row.names = c(NA, -14L), class = "data.frame")
, itemsetInfo = structure(list(), .Names = character(0), row.names = integer(0), class = "data.frame")
)
, rhs = new("itemMatrix"
, data = new("ngCMatrix"
, i = c(3L, 12L, 13L, 12L, 5L, 3L, 8L, 13L, 0L, 3L, 8L, 3L, 3L, 8L,
6L, 5L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 3L, 12L, 5L,
12L, 12L, 13L, 4L, 13L, 3L, 0L, 8L, 12L, 6L, 5L)
, p = 0:38
, Dim = c(14L, 38L)
, Dimnames = list(NULL, NULL)
, factors = list()
)
, itemInfo = structure(list(labels = structure(c("variance=(-0.336,446]",
"variance=(446,892]", "variance=(892,1.34e+03]", "skewness=(-0.254,419]",
"skewness=(419,837]", "skewness=(837,1.26e+03]", "curtosis=(-0.268,424]",
"curtosis=(424,846]", "curtosis=(846,1.27e+03]", "entropy=(-0.155,386]",
"entropy=(386,771]", "entropy=(771,1.16e+03]", "notes.class=FALSE",
"notes.class=TRUE"), class = "AsIs"), variables = structure(c(5L,
5L, 5L, 4L, 4L, 4L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L), .Label = c("curtosis",
"entropy", "notes.class", "skewness", "variance"), class = "factor"),
levels = structure(c(4L, 8L, 12L, 2L, 6L, 10L, 3L, 7L, 11L,
1L, 5L, 9L, 13L, 14L), .Label = c("(-0.155,386]", "(-0.254,419]",
"(-0.268,424]", "(-0.336,446]", "(386,771]", "(419,837]",
"(424,846]", "(446,892]", "(771,1.16e+03]", "(837,1.26e+03]",
"(846,1.27e+03]", "(892,1.34e+03]", "FALSE", "TRUE"), class = "factor")), .Names = c("labels",
"variables", "levels"), row.names = c(NA, -14L), class = "data.frame")
, itemsetInfo = structure(list(), .Names = character(0), row.names = integer(0), class = "data.frame")
)
, quality = structure(list(support = c(0.261123267687819, 0.323121808898614,
0.285922684172137, 0.292487235594457, 0.159737417943107, 0.137855579868709,
0.137855579868709, 0.142231947483589, 0.142231947483589, 0.110138584974471,
0.110138584974471, 0.12399708242159, 0.153902261123268, 0.107221006564551,
0.13056163384391, 0.13056163384391, 0.150984682713348, 0.139314369073669,
0.100656455142232, 0.107221006564551, 0.154631655725748, 0.165572574762947,
0.112326768781911, 0.105762217359592, 0.12180889861415, 0.181619256017505,
0.181619256017505, 0.102844638949672, 0.105762217359592, 0.12837345003647,
0.12837345003647, 0.137855579868709, 0.137855579868709, 0.137855579868709,
0.137855579868709, 0.13056163384391, 0.13056163384391, 0.13056163384391
), confidence = c(0.804494382022472, 0.988839285714286, 0.863436123348018,
0.87746170678337, 0.952173913043478, 0.832599118942731, 0.832599118942731,
0.859030837004405, 0.898617511520737, 0.853107344632768, 0.915151515151515,
0.80188679245283, 0.972350230414747, 0.885542168674699, 0.864734299516908,
0.913265306122449, 1, 0.974489795918367, 1, 1, 0.990654205607477,
1, 0.980891719745223, 0.873493975903614, 0.814634146341463, 0.943181818181818,
0.950381679389313, 1, 0.92948717948718, 0.931216931216931, 0.897959183673469,
1, 0.969230769230769, 0.895734597156398, 0.832599118942731, 1,
0.864734299516908, 0.93717277486911), lift = c(2.41877587226493,
1.78146998779801, 1.94060807395104, 1.580814717477, 2.85652173913043,
2.50327498261071, 2.56515369004603, 1.93070701234925, 2.71366653809456,
2.56493458221826, 2.81948927477017, 2.41093594836147, 2.92344773223381,
2.72826587247868, 2.58853870008227, 2.73979591836735, 1.80157687253614,
1.75561827884899, 1.80157687253614, 1.80157687253614, 1.78473970550309,
2.24754098360656, 2.20459434060771, 1.96321350977681, 2.44926187419769,
1.69921455023295, 2.85114503816794, 1.80157687253614, 1.67454260588295,
2.09294821753838, 2.68799572230639, 2.24754098360656, 2.91406882591093,
2.70496064471679, 2.56515369004603, 1.80157687253614, 2.58853870008227,
2.81151832460733)), row.names = c(NA, 38L), .Names = c("support",
"confidence", "lift"), class = "data.frame")
, info = structure(list(data = data, ntransactions = 1371L, support = 0.1,
confidence = 0.8), .Names = c("data", "ntransactions", "support",
"confidence"))
)
We can't duplicate your data from your question (oh, you just added your data as I was typing this! Sorry!), so I'll use the example from the arules package:
library('arules');
data("Adult")
## Mine association rules.
rules <- apriori(Adult,
parameter = list(supp = 0.5, conf = 0.9,
target = "rules"))
Then I can duplicate the stuff output from inspect(rules):
> ruledf = data.frame(
lhs = labels(lhs(rules))$elements,
rhs = labels(rhs(rules))$elements,
rules@quality)
> head(ruledf)
lhs rhs support confidence lift
1 {} {capital-gain=None} 0.9173867 0.9173867 1.0000000
2 {} {capital-loss=None} 0.9532779 0.9532779 1.0000000
3 {hours-per-week=Full-time} {capital-gain=None} 0.5435895 0.9290688 1.0127342
4 {hours-per-week=Full-time} {capital-loss=None} 0.5606650 0.9582531 1.0052191
5 {sex=Male} {capital-gain=None} 0.6050735 0.9051455 0.9866565
6 {sex=Male} {capital-loss=None} 0.6331027 0.9470750 0.9934931
and do stuff like order by decreasing lift:
head(ruledf[order(-ruledf$lift),])
The help for the rules class: http://www.rdocumentation.org/packages/arules/functions/rules-class.html will tell you what you can get from your rules object - I just used that information to build a data frame. If it's not exactly what you want, then cook one up using your own recipe!
Run apriori in data Adult
rules <- apriori(Adult, parameter = list(supp = 0.5, conf = 0.9, target =
"rules"))
Inspect LHS, RHS, support, confidence and lift
arules::inspect(rules)
Create a dataframe
# Combine the rule sides with their quality measures in one data frame:
# one row per rule, with the left-hand and right-hand side labels followed
# by the columns of the rules object's `quality` slot.
df <- data.frame(
  lhs = labels(lhs(rules)),
  rhs = labels(rhs(rules)),
  # S4 slots are accessed with `@`; a `#` here would start a comment and
  # leave the data.frame() call unclosed.
  rules@quality
)
View top 6 lines in new dataframe
head(df)
This does the trick
rules_dataframe <- as(output, 'data.frame')

Correlating two data sets in R leaving out subjects not in both

Still very new to R and have a question about performing a correlation. I have two data sets that I want to correlate. Let's say I named the sets Data1 and Data2 for simplicity. Most of the subjects are in both sets but there are some subjects that are not. This is a problem as I now have uneven data sets that cannot correlate. How do I tell R to ignore the subjects that are not in both data sets so that I can perform my correlation? I know there is likely a way to have R ignore these subjects in the same command where I ask it to correlate my sets.
Also if I want R to only correlate columns 4:7 using the subject IDs in column 1 would I, for example, use the command cor.test(Data1[1,4:7], Data2[1,4:7])?
Thanks for any help you can provide.
Disclaimer: not tested, because no minimal working example (MWE) was provided.
Try something like this:
cor.test(subset(x=Data1, subset=ID==1, select=4:7), subset(x=Data2, subset=ID==1, select=4:7))
Try:
data
dat1 <- structure(list(V1 = c(9L, 2L, 5L, 9L, 9L), V2 = c(8L, 4L, 7L,
9L, 6L), V3 = c(4L, 5L, 7L, 7L, 8L), V4 = c(7L, 4L, 6L, 7L, 1L
), V5 = c(9L, 2L, 10L, 7L, 10L), subject = 1:5), .Names = c("V1",
"V2", "V3", "V4", "V5", "subject"), row.names = c(NA, -5L), class = "data.frame")
dat2 <- structure(list(V1 = c(2L, 6L, 5L, 9L, 7L), V2 = c(2L, 10L, 5L,
5L, 6L), V3 = c(3L, 4L, 3L, 8L, 7L), V4 = c(3L, 2L, 10L, 1L,
9L), V5 = c(2L, 4L, 8L, 1L, 6L), subject = c(1, 3, 5, 6, 8)), .Names = c("V1",
"V2", "V3", "V4", "V5", "subject"), row.names = c(NA, -5L), class = "data.frame")
Create an index of subject IDs that are common in both
indx <- intersect(dat1$subject, dat2$subject)
Apply cor.test on the dataset with common subject IDs
# match() picks the rows in the order given by indx, so each row of the first
# matrix is paired with the same subject's row in the second, even when the
# two data sets list their subjects in different orders.
cor.test(
  as.matrix(dat1[match(indx, dat1$subject), 3:5]),
  as.matrix(dat2[match(indx, dat2$subject), 3:5])
)

Split a R dataframe by rows containing a keyword

Is there a quick way to split a large data.frame by keywords
so for example if I have the data set below is there a quick way to split the data frame at each occurrence of the source:restaurant line? Another take on the question would be is there a quick way of creating factors for the dataframe based upon a list of cut offs (in this case c(3,7,10)) that would then give me e.g. factors=c(A,A,A,B,B,B,B,C,C,C) that I could use in a split(mylist,factors) formula? Thanks
mylist=structure(list(V1 = structure(c(5L, 3L, 7L, 8L, 6L, 4L, 7L, 2L,
1L, 7L), .Label = c("cider", "claret", "custard", "krispies",
"rhubarb", "shreddies", "source:restaurant", "weetabix"), class = "factor"),
V2 = c(1L, 5L, NA, 9L, 13L, 17L, NA, 21L, 25L, NA), V3 = c(2L,
6L, NA, 10L, 14L, 18L, NA, 22L, 26L, NA), V4 = c(3L, 7L,
NA, 11L, 15L, 19L, NA, 23L, 27L, NA), V5 = c(4L, 8L, NA,
12L, 16L, 20L, NA, 24L, 28L, NA)), .Names = c("V1", "V2",
"V3", "V4", "V5"), class = "data.frame", row.names = c(NA, -10L
))
A very clunky possible solution below but I'm hoping for something a bit more elegant..
# Build a grouping factor by hand: one letter per block of rows, where each
# block ends at a 'source:restaurant' row, then split the data frame on it.
temp=NULL
# Row positions of the marker rows.
a=which(mylist[,1] == 'source:restaurant')
# For the i-th marker, append letters[i] once for every row since the previous
# marker (a[i] minus the number of rows already labelled).
for(i in seq_along(a)){temp=c(temp,rep(letters[i],(a[i]-length(temp))))}
temp=as.factor(temp)
split(mylist,temp)
The factor:
factor(cumsum(mylist$V1 == "source:restaurant") + 1)
the split:
split(mylist, cumsum(mylist$V1 == "source:restaurant"))
UPDATE: the source:restaurant row probably comes at the end of each group that it marks; to account for this you can use:
factor(cumsum(c(0, head(mylist$V1 == "source:restaurant", -1))) + 1)
split(mylist, cumsum(c(0, head(mylist$V1 == "source:restaurant", -1))))
would be better.

Resources