retrieving names(table) following subset where only 1 observation - r

My issue is that when I try to retrieve names(myresults) after subsetting a table, I get NULL when the returned subset has only one result. Rather than returning a character vector of row names, R returns an integer (in this case, 1).
Here is a table
head(tbl)
1 2 3 4 5 6
afford 0 1 0 0 0 0
app 0 0 0 1 0 0
back 0 1 0 0 0 0
cancel 0 0 0 0 1 0
charg 0 0 0 0 0 1
download 0 0 0 0 0 1
I have been subsetting the table within a loop to return a table for each group. If a term belongs to a group it has a value of 1:
# Print one table per cluster (column of tbl) listing the terms flagged with 1.
for (i in 1:ncol(tbl)) {
# Rows whose value in column i equals 1, restricted to that same column.
t <- tbl[which(tbl[,i]==1),i]
# NOTE(review): when exactly one row matches, names(t) comes back NULL
# (see the group 4 example below), so the term is lost at this point.
nam <- names(t)
df <- as.data.frame(nam)
# Label the single column with the cluster number.
names(df) <- paste0("Cluster ",i)
print(kable(df))
}
This loop seems to work OK when there is more than one instance of a term returned by which(). But group 4, which has only one term, "app", gives me issues. Here's an example on group 3, which works as expected, and on group 4, which does not:
> t <- tbl[which(tbl[,4]==1),4] # only 1 observation meets this criteria
> t
[1] 1
> t <- tbl[which(tbl[,3]==1),3] # 3 observations meet this criteria
> t
aword cat dog
1 1 1
So I can get names(t) for tbl[,3] where it has 3 returned instances but not for tbl[,4] which only has 1.
> t <- fintab[which(fintab[,4]==1),4]
> names(t)
NULL # expected "app"
> t <- fintab[which(fintab[,3]==1),3]
> names(t)
[1] "aword" "cat" "dog"
How can I get names(t) when I have only 1 instance returned like in the example?
Some further context following comment below:
> str(tbl)
'table' int [1:33, 1:6] 0 0 0 0 0 0 0 0 0 0 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:33] "aword" "app" "cat" "dog" ...
..$ : chr [1:6] "1" "2" "3" "4" ...
>
and
> dput(tbl)
structure(c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L), .Dim = c(33L, 6L), .Dimnames = structure(list(
c("aword", "app", "back", "cancel", "charg", "download",
"enough", "expens", "get", "great", "just", "like", "love",
"cat", "dog", "bla", "month", "much", "need",
"never", "phone", "pleas", "blabla", "realli", "term", "sign",
"thank", "time", "triangle", "use", "want", "will", "work"), c("1",
"2", "3", "4", "5", "6")), .Names = c("", "")), class = "table")

As we are subsetting a single column, we get the logical index (tbl[,4] == 1; there is no need to wrap it with which unless there are NAs, in which case which removes those NAs) and use that to subset the column vector.
# Pull the column out first, then filter it with a logical mask; the printed
# results below show that the row names survive even for a single match.
col4 <- tbl[, 4]
col4[col4 == 1]
# app
# 1
col3 <- tbl[, 3]
col3[col3 == 1]
# cat blabla time
# 1 1 1

Related

Add new columns with defined values in R

I have a data.table named dmat. I want to add each element of missing_snps to dmat as a new column, with every row set to zero. The output should remain in the same class as it was.
I would appreciate any suggestion.
dmat <- structure(list(`1:27950613:G:A` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27950883:CTA:C` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27952180:A:G` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27953106:A:G` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27953374:G:T` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27953514:T:TA` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27953608:T:C` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27954027:G:A` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27954415:T:C` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27962685:T:C` = c(0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 0L)), row.names = c(NA, -10L), class = c("tbl_df", "tbl",
"data.frame"))
# Names of the SNP columns that are not yet present in dmat.
missing_snps <- c("1:169858888:G:A", "1:16985867657:T:A", "1:132862874:G:A")
# Assign 0 to the listed column names in dmat.
dmat[,c("1:169858888:G:A", "1:16985867657:T:A", "1:132862874:G:A")] <- 0
or dmat[, missing_snps] <- 0
Using data.table,
# Names of the columns to create.
missing_snps <- c(
  "1:169858888:G:A",
  "1:16985867657:T:A",
  "1:132862874:G:A"
)
# Convert to a data.table, then add every listed column filled with 0.
# The parentheses make := use the values stored in missing_snps as names.
dmat <- setDT(dmat)
dmat[, (missing_snps) := 0]
Output
> dmat[,..missing_snps ]
1:169858888:G:A 1:16985867657:T:A 1:132862874:G:A
1: 0 0 0
2: 0 0 0
3: 0 0 0
4: 0 0 0
5: 0 0 0
6: 0 0 0
7: 0 0 0
8: 0 0 0
9: 0 0 0
10: 0 0 0
The columns you want to mutate have been added.

mlogit : using varying alternatives for mlogit in R

I am trying to use varying alternatives for each person, but I am not able to get it working. If I make the alternatives the same for each person, it works fine. How can I make it work with varying alternatives?
Data :
> dput( df1 )
structure(list(Choice = c(1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L,
0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L,
1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L), A = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 0L, 1L, 0L, 0L, -1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), B = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 1L, 0L, 0L, -1L, 0L,
0L), C = c(1L, 0L, 0L, 0L, -1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 0L, 0L,
1L, 0L, 0L, -1L, 0L, 0L, 0L, 0L, 0L, 0L), D = c(0L, 1L, 0L, 0L,
0L, -1L, 0L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 0L, 1L, 0L, 0L, -1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), E = c(0L, 0L, 1L, 0L, 0L, 0L, -1L, 0L, 0L, 0L, 1L,
0L, 0L, -1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, -1L, 0L), F = c(0L, 0L,
0L, 1L, 0L, 0L, 0L, -1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, -1L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, -1L), Alternative = c(1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L)), row.names = c(NA, -38L), class = "data.frame")
Code :
# Fit mlogit on the long-format data, taking the alternative levels from the
# distinct values of df1$Alternative.
model <- mlogit(
  Choice ~ B + C + D + E + F | 0,
  data = df1,
  alt.levels = unique(df1$Alternative),
  shape = "long"
)
Error
Error in dfidx::dfidx(data = data, dfa$idx, drop.index = dfa$drop.index, :
the data must be balanced in order to use the levels argument
You need to provide mlogit with an explicit ID variable denoting which participant made the choice. It can't infer them from the data.frame you've provided.
I'm assuming in your reproducible example that the alternatives in rows running sequentially from [1 - 4] or [1 - 3] represent the choice sets presented to a unique individual. If so, then you can fit a model like so:
library(mlogit)

# One ID per choice set: the first two sets have four alternatives and the
# remaining ten have three (2 * 4 + 10 * 3 = 38 rows).
set_sizes <- rep(c(4, 3), times = c(2, 10))
df1$ID <- rep(seq_len(12), times = set_sizes)

# Convert to indexed (dfidx) data, passing ID as the id variable.
dfx1 <- mlogit.data(
  df1,
  choice = "Choice",
  shape = "long",
  id.var = "ID"
)

# Fit the model on the indexed data.
m0 <- mlogit(Choice ~ B + C + D + E + F | 0, data = dfx1)

Export ftable factors to html

I have a table created from ftable()
structure(c(1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 2L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L), .Dim = c(12L, 7L), class = "ftable", row.vars = list(
ï..petal_size = c("large ", "small", "small "), stem_length = c("long",
"long ", "short", "short ")), col.vars = list(flow_color = c("blue",
"green", "indigo ", "orange", "red ", "violet", "yellow")))
I would like to export it using htmlTable, but when I use htmlTable on this I get this result with no factors and just numbers, like in the picture here
How do I recover the factor names for the htmltable? Please note the final output should have the same number of rows and columns as the picture's output, but it needs to have the factor names on the rows and columns.
I will convert it first to a data.frame and then add the necessary tweaks to obtain the desired output:
tableToHtml <-structure(c(1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 2L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L), .Dim = c(12L, 7L), class = "ftable", row.vars = list(
ï..petal_size = c("large ", "small", "small "), stem_length = c("long",
"long ", "short", "short ")), col.vars = list(flow_color = c("blue",
"green", "indigo ", "orange", "red ", "violet", "yellow")))
library(htmlTable)
# Flatten the ftable once, then render it without row names and with one
# empty header cell per column.
flat_table <- as.data.frame(tableToHtml)
htmlTable(flat_table, rnames = FALSE, header = rep("", ncol(flat_table)))

Why does metaMDS() produce a horizontal distribution of our data?

We have a species presence table (so binary: 1=present, 0=absent). When using metaMDS of the vegan package, it produces a horizontal distribution of our data when plotted, instead of clusters.
We tried using different distance methods (Euclidean, Bray, Jaccard), but they all seem to produce the same plot.
myfungi.all looks like this:
structure(list(Sample = 1:12, Habitat = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Dune", "Forest"
), class = "factor"), OTU88 = c(0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L), OTU28 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), OTU165 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU178 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L), OTU97 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L
), OTU39 = c(0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L),
OTU104 = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L
), OTU95 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L,
0L), OTU90 = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU119 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU451 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L), OTU98 = c(1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU45 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L,
1L), OTU2 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L,
1L), OTU24 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), OTU169 = c(0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU29 = c(1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU85 = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L), OTU140 = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L,
0L), OTU42 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L), OTU70 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L), OTU25 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU34 = c(1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
1L), OTU181 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU201 = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU17 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU1146 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L), OTU14 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L,
1L, 1L), OTU72 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L,
0L, 0L), OTU13 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
1L, 1L), OTU20 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L), OTU63 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU170 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU262 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU48 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU6 = c(0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L,
0L, 0L), OTU3 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 1L), OTU31 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU73 = c(1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L,
0L, 0L), OTU32 = c(0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), OTU37 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU196 = c(0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), OTU5 = c(1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), OTU11 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
0L, 1L), OTU16 = c(0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), OTU41 = c(0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), OTU71 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU109 = c(0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), OTU233 = c(0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L)), class = "data.frame", row.names = c(NA, -12L))
Our script looks like this:
# Read the presence/absence table and keep only the OTU columns
# (columns 1-2 are Sample and Habitat).
myfungi.all <- read.csv("soil_fungi.csv", header = TRUE)
myfungi <- myfungi.all[, 3:51]

# Two-dimensional NMDS on Bray-Curtis dissimilarities of the binary data.
myfungi.nmds.bc <- metaMDS(myfungi, distance = "bray", k = 2, binary = TRUE)

# Text plot of the ordination; the title reports the stress value.
plot(
  myfungi.nmds.bc,
  type = "t",
  main = paste(
    "NMDS/Bray-Curtis - Stress =",
    round(myfungi.nmds.bc$stress, 10)
  )
)
Does anyone have suggestions as what seems to be the problem?
At the moment our plot looks like this:
The solution you reported gives a perfect fit (stress nearly 0), and also gives a warning because of this dubious stress. The solution effectively puts your sampling units into two points so that you have absolutely dichotomous data. As Ben Bolker demonstrated, Principal Coordinates Analysis, PCoA (which you also can perform with stats::cmdscale, vegan::wcmdscale or vegan::dbrda) still has points in two major clusters, but spreads points within these clusters. PCoA is a linear method, but NMDS is non-linear and therefore often needs more data. It seems that in this case the treatment of weak ties (read the documentation ?monoMDS or Kruskal's papers cited in that documentation) is the stage that puts most demand on the data, and setting weakties = FALSE will prevent collapsing non-identical observations into two points:
# Rerun the NMDS with weakties = FALSE so that non-identical observations are
# not collapsed onto the same point.
m3 <- metaMDS(myfungi, weakties = FALSE)
m3 # stress 0.04124
stressplot(m3) # compare this to your result stressplot(myfungi.nmds.bc)
plot(m3)
The default monoMDS with weakties = TRUE (like Kruskal recommended) will consider the dichotomy of two groups as the only important non-linear difference, but with weakties = FALSE the solutions cannot proceed to zero stress. You still have a dichotomy, but with scatter.
Best guess is that you simply don't have enough data to distinguish two separate environmental axes: when I run your code I get
Warning message: In metaMDS(myfungi[, -(1:2)], distance = "bray", k = 2, binary = TRUE) : stress is (nearly) zero: you may have insufficient data
Out of your 53 species, only 35 are informative (the others appear either at none or at all of the sites):
# Keep only the columns whose values vary between sites (variance > 0).
has_variance <- apply(myfungi, 2, var) > 0
m2 <- myfungi[, has_variance]
ncol(m2) ## 35

# Helper: display a data set as an image of its matrix form.
vv <- function(x) {
  (image(Matrix(as.matrix(x))))
}
How many distinct distribution patterns are there?
# Transpose so each column of m2 becomes a row, then count the distinct rows.
nrow(unique(t(m2))) ## 27
You could try PCoA instead:
library(ape)
# Principal coordinates analysis on the Bray-Curtis dissimilarities of m2.
biplot(pcoa(vegdist(m2, "bray")))
As Jari Oksanen points out, you could also do this with cmdscale() in base R:
# Classical MDS (PCoA) in base R on the filtered species table m2, with the
# points coloured by habitat.
# NOTE(review): Habitat is taken from myfungi.all because myfungi, as built
# from columns 3:51 in the question, no longer carries it - confirm naming.
plot(cmdscale(vegdist(m2, "bray")),
     col = as.numeric(myfungi.all$Habitat))

How do you calculate the average rating per genre

I have a file with 30 columns. These include userid, itemid, moviename, rating, date and the rest are to classify genres a movie belongs to. The genre categories are column names with binary values in the rows. If a movie belongs to a genre, it has a 1 under the appropriate column and 0 otherwise. I want to calculate the average rating per genre and want to know if there is a shorter process available?
I have currently tried filtering the data by selecting each genre where the value is '1' and then calculating the average rating. But I have almost 24 genres, and doing it this way is inefficient, I think. Another way I have tried is to loop through the genre columns, again filtering each genre where the value is '1', but loops consume a lot of time, and when the data set is large (more than 100K rows), R can play up sometimes, as I have noticed.
I want to ask if there is another way which avoids a loop like melt,dcast or another method that can get the same job done?
I am providing the dput of my dataset.
dput(data)
structure(list(user_id = c(10L, 890L, 867L, 5L, 320L, 630L, 151L,
699L, 21L, 450L, 179L, 135L, 314L, 487L, 735L, 823L, 169L, 889L,
846L), item_id = c(447L, 660L, 191L, 441L, 1052L, 568L, 414L,
1061L, 872L, 33L, 302L, 581L, 568L, 280L, 181L, 503L, 498L, 207L,
497L), Movie_title = structure(c(6L, 11L, 2L, 3L, 9L, 17L, 15L,
10L, 14L, 8L, 13L, 12L, 17L, 18L, 16L, 5L, 1L, 7L, 4L), .Label = c("African Queen, The (1951)",
"Amadeus (1984)", "Amityville Horror, The (1979)", "Bringing Up Baby (1938)",
"Candidate, The (1972)", "Carrie (1976)", "Cyrano de Bergerac (1990)",
"Desperado (1995)", "Dracula: Dead and Loving It (1995)", "Evening Star, The (1996)",
"Fried Green Tomatoes (1991)", "Kalifornia (1993)", "L.A. Confidential (1997)",
"Love Jones (1997)", "My Favorite Year (1982)", "Return of the Jedi (1983)",
"Speed (1994)", "Up Close and Personal (1996)"), class = "factor"),
Rating = c(4L, 2L, 5L, 1L, 2L, 4L, 5L, 3L, 2L, 5L, 4L, 4L,
5L, 5L, 4L, 5L, 3L, 3L, 5L), Date = structure(c(7L, 15L,
12L, 4L, 1L, 2L, 9L, 8L, 19L, 14L, 18L, 10L, 6L, 16L, 5L,
11L, 17L, 13L, 3L), .Label = c("1/14/1998", "1/25/1998",
"1/5/1998", "10/1/1997", "10/13/1997", "10/26/1997", "10/27/1997",
"11/10/1997", "11/15/1997", "11/18/1997", "11/2/1997", "11/21/1997",
"11/22/1997", "12/18/1997", "12/24/1997", "12/30/1997", "3/31/1998",
"4/10/1998", "9/22/1997"), class = "factor"), unknown = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Action = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L), Adventure = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
1L, 0L, 0L), Animation = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Children = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Comedy = c(0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), Crime = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), Documentary = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Drama = c(0L,
1L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L,
0L, 1L, 0L), Fantasy = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Film.Noir = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Horror = c(1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Musical = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Mystery = c(0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Romance = c(0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L,
1L, 1L, 0L), Sci.Fi = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), Thriller = c(0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L,
0L, 0L, 0L), War = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L), Western = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Short = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), History = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Biography = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Sport = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Family = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("user_id",
"item_id", "Movie_title", "Rating", "Date", "unknown", "Action",
"Adventure", "Animation", "Children", "Comedy", "Crime", "Documentary",
"Drama", "Fantasy", "Film.Noir", "Horror", "Musical", "Mystery",
"Romance", "Sci.Fi", "Thriller", "War", "Western", "Short", "History",
"Biography", "Sport", "Family"), class = "data.frame", row.names = c(NA,
-19L))
This is a good use case for dplyr and tidyr:
library(dplyr)
library(tidyr)

# Average rating per genre: reshape to one row per movie/genre pair, keep the
# pairs where the movie belongs to the genre, then average Rating per genre.
# pivot_longer() replaces the superseded gather().
dat %>%
  pivot_longer(unknown:Family, names_to = "genre", values_to = "value") %>%
  filter(value == 1) %>%
  group_by(genre) %>%
  summarize(average = mean(Rating))
This code:
gathers each of the movie/genre pairs into a separate row (there will be multiple rows for each movie)
filters for only the cases when a movie belongs to a genre
groups by genre, and summarizes within each to find the average rating (you could perform other operations like the median or standard deviation as well)
The old-fashioned way also works:
# Genres to summarise; extend this vector to cover more genre columns.
genres <- c('Action','Adventure','Animation')
# Preallocated result with one named slot per genre.
means <- numeric(length(genres))
names(means) <- genres
for (g in genres) {
  # Mean rating over the rows where the genre flag equals 1. The result is
  # stored in `means`, the vector that is printed afterwards.
  means[g] <- mean(myData$Rating[myData[, g] == 1])
}
means

Resources