I have data as follows:
dat <- structure(list(rn = c("A", "B",
"C", "D", "E",
"F", "G", "H",
"I", "J", "K",
"L", "M", "N"
), `0` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), `1` = c(0L, 0L, 0L, 0L, 0L, 0L, 569L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), `2` = c(0L, 0L, 0L, 238L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `3` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1146L, 0L,
0L, 0L, 0L, 0L), `4` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 337L, 0L, 0L), `5` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 631L), `6` = c(0L, 0L, 0L, 0L, 156L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `7` = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 298L, 0L, 0L, 0L), `8` = c(0L, 0L, 0L, 0L,
0L, 456L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `9` = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 927L, 0L, 0L, 0L, 0L, 0L, 0L), `10` = c(436L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `11` = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 657L, 0L, 0L, 0L, 0L), `12` = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1771L, 0L), `13` = c(0L,
0L, 283L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `14` = c(0L,
297L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), class = c("data.table",
"data.frame"), row.names = c(NA, -14L))
rn 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
1: A 0 0 0 0 0 0 0 0 0 0 436 0 0 0 0
2: B 0 0 0 0 0 0 0 0 0 0 0 0 0 0 297
3: C 0 0 0 0 0 0 0 0 0 0 0 0 0 283 0
4: D 0 0 238 0 0 0 0 0 0 0 0 0 0 0 0
5: E 0 0 0 0 0 0 156 0 0 0 0 0 0 0 0
6: F 0 0 0 0 0 0 0 0 456 0 0 0 0 0 0
7: G 0 569 0 0 0 0 0 0 0 0 0 0 0 0 0
8: H 0 0 0 0 0 0 0 0 0 927 0 0 0 0 0
9: I 0 0 0 1146 0 0 0 0 0 0 0 0 0 0 0
10: J 0 0 0 0 0 0 0 0 0 0 0 657 0 0 0
11: K 0 0 0 0 0 0 0 298 0 0 0 0 0 0 0
12: L 0 0 0 0 337 0 0 0 0 0 0 0 0 0 0
13: M 0 0 0 0 0 0 0 0 0 0 0 0 1771 0 0
14: N 0 0 0 0 0 631 0 0 0 0 0 0 0 0 0
I want to create a column with the column name of the column in which there is a value greater than zero.
Desired output:
dat <- structure(list(rn = c("A", "B",
"C", "D", "E",
"F", "G", "H",
"I", "J", "K",
"L", "M", "N"
), NR = c(10, 14, 13, 2, 6, 8, 1, 9, 3, 11, 7, 4, 12,
5)), class = c("data.table",
"data.frame"), row.names = c(NA, -14L))
rn NR
1: A 10
2: B 14
3: C 13
4: D 2
5: E 6
6: F 8
7: G 1
8: H 9
9: I 3
10: J 11
11: K 7
12: L 4
13: M 12
14: N 5
An easier option is max.col from base R
library(data.table)
# max.col() returns the *position* (not the name) of the first TRUE in each
# row of the logical matrix built from every column except `rn`.
# Subtracting 1 turns that position into the column label only because the
# value columns are named 0..14 in order.
# NOTE: a row with no value > 0 is an all-tie row, so "first" picks position 1
# and NR would be 0 for it as well.
dat[, .(rn, NR = max.col(.SD[,-1, with = FALSE] > 0, "first")-1)]
-output
rn NR
<char> <num>
1: A 10
2: B 14
3: C 13
4: D 2
5: E 6
6: F 8
7: G 1
8: H 9
9: I 3
10: J 11
11: K 7
12: L 4
13: M 12
14: N 5
Or another option is apply from base R
# For each row, take the position of the first value > 0 among the columns
# other than `rn`; the trailing -1 converts that position into the column
# label, which works because the value columns are named 0..14 in order.
# Rows without any positive value yield NA.
apply(dat[, -1], 1, \(x) which(x > 0)[1])-1
[1] 10 14 13 2 6 8 1 9 3 11 7 4 12 5
Related
In R, I've created 25x25 matrices of values of 1 and 0 and I need to find the height between the first occurrence of 1 in the matrix and the last occurrence of 1 in the matrix.
Here's an example of a matrix of the letter a, where each 1 represents a black pixel and each 0 represents a white pixel:
a <- read.csv(csv_files[1])
a
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
7 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
8 0 0 0 0 0 1 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0
9 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
10 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
11 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
12 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
13 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
14 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
15 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
16 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
17 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0
18 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0
19 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0
20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
21 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
22 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
23 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
24 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
My idea is to find the row number of the last occurrence of 1 and the row number of the first occurrence of 1 and subtract one from the other, which will give me the height of the symbol.
In this case it would be 19 - 6 = 13, so the height is 13.
For context, I drew images of different letters and symbols in GIMP, and then imported them into R and saved them in a matrix as a CSV file.
Try the code below
> diff(range(which(a == 1, arr.ind = TRUE)[, "row"]))
[1] 13
Data
> dput(a)
structure(list(V1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
V2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), V3 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), V4 = c(0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L), V5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), V6 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L
), V7 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), V8 = c(0L,
0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), V9 = c(0L, 0L, 0L, 0L,
0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L), V10 = c(0L, 0L, 0L, 0L, 0L, 1L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 0L), V11 = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), V12 = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L
), V13 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), V14 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), V15 = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), V16 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L), V17 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L), V18 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), V19 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
V20 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), V21 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), V22 = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), V23 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), V24 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), V25 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L)), class = "data.frame", row.names = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20", "21", "22", "23", "24", "25"))
I have a dataframe 522x329 with sites for columns showing the sum of observed species at that site for that particular week in each year.
I want to return a dataframe for each site that corresponds to each week within the year with 0 values, and the same for those with values > 1.
For example, here is a reproducible code + glimpse of the dataset:
structure(list(year = c(2010, 2011, 2012, 2013, 2014, 2015),
week = c(1, 1, 1, 1, 1, 1), `IP20 0HR` = c(0L, 0L, 0L, 0L,
0L, 0L), `IP20 9LR` = c(0L, 0L, 0L, 0L, 0L, 0L), `IP20 9PJ` = c(0L,
0L, 0L, 0L, 0L, 0L), `IP20 9QE` = c(0L, 0L, 0L, 0L, 0L, 0L
), `IP21 4NT` = c(0L, 0L, 0L, 0L, 0L, 0L), `IP21 4NZ` = c(0L,
0L, 0L, 0L, 0L, 0L), `IP21 4PT` = c(0L, 0L, 0L, 0L, 0L, 0L
), `IP21 4TR` = c(12L, 0L, 0L, 0L, 0L, 0L), `IP22 2AP` = c(0L,
0L, 0L, 0L, 0L, 0L), `IP22 2BP` = c(0L, 0L, 0L, 0L, 0L, 0L
), `IP22 2DZ` = c(0L, 0L, 0L, 0L, 0L, 9L), `IP22 2JG` = c(0L,
0L, 0L, 0L, 0L, 0L), `IP22 4BD` = c(9L, 7L, 0L, 7L, 3L, 0L
), `IP22 4BE` = c(7L, 8L, 9L, 7L, 8L, 8L), `IP22 4PH` = c(0L,
0L, 0L, 0L, 0L, 0L), `IP22 4YW` = c(6L, 5L, 7L, 5L, 3L, 7L
), `IP22 5SB` = c(15L, 25L, 25L, 22L, 17L, 16L), `IP22 5SR` = c(0L,
0L, 0L, 0L, 0L, 0L), `IP22 5SX` = c(0L, 9L, 9L, 12L, 0L,
0L), `IP22 5TY` = c(0L, 0L, 0L, 0L, 0L, 0L), `IP24 1JF` = c(8L,
4L, 2L, 0L, 6L, 6L), `IP24 1LB` = c(7L, 0L, 0L, 0L, 0L, 0L
), `IP24 1PN` = c(0L, 0L, 0L, 0L, 0L, 0L), `IP24 1QR` = c(0L,
0L, 0L, 0L, 0L, 0L), `IP24 1RA` = c(0L, 0L, 0L, 0L, 0L, 0L
), `IP24 1UE` = c(0L, 0L, 0L, 0L, 0L, 0L), `IP24 2AD` = c(14L,
8L, 2L, 0L, 0L, 0L), `IP24 2JZ` = c(5L, 0L, 5L, 0L, 0L, 0L
), `IP24 2LD` = c(0L, 0L, 0L, 0L, 0L, 0L), `IP24 2LW` = c(0L,
0L, 0L, 0L, 0L, 0L), `IP24 2PU` = c(0L, 0L, 0L, 0L, 0L, 0L
), `IP24 2TD` = c(0L, 0L, 0L, 0L, 0L, 0L), `IP24 2TQ` = c(0L,
0L, 0L, 0L, 0L, 0L), `IP24 2YR` = c(0L, 0L, 0L, 0L, 0L, 10L
), `IP24 2YW` = c(10L, 11L, 11L, 0L, 0L, 0L), `IP24 2ZA` = c(0L,
0L, 0L, 0L, 0L, 0L), `IP24 3EP` = c(12L, 14L, 0L, 0L, 0L,
0L), `IP24 3HG` = c(0L, 10L, 8L, 7L, 4L, 9L)), row.names = c(NA,
6L), class = "data.frame")
year week IP20 0HR IP20 9LR IP20 9PJ IP20 9QE IP21 4NT IP21 4NZ IP21 4PT IP21 4TR IP22 2AP IP22 2BP IP22 2DZ IP22 2JG
1 2010 1 0 0 0 0 0 0 0 12 0 0 0 0
2 2011 1 0 0 0 0 0 0 0 0 0 0 0 0
3 2012 1 0 0 0 0 0 0 0 0 0 0 0 0
4 2013 1 0 0 0 0 0 0 0 0 0 0 0 0
5 2014 1 0 0 0 0 0 0 0 0 0 0 0 0
6 2015 1 0 0 0 0 0 0 0 0 0 0 9 0
IP22 4BD IP22 4BE IP22 4PH IP22 4YW IP22 5SB IP22 5SR IP22 5SX IP22 5TY IP24 1JF IP24 1LB IP24 1PN IP24 1QR IP24 1RA IP24 1UE
1 9 7 0 6 15 0 0 0 8 7 0 0 0 0
2 7 8 0 5 25 0 9 0 4 0 0 0 0 0
3 0 9 0 7 25 0 9 0 2 0 0 0 0 0
4 7 7 0 5 22 0 12 0 0 0 0 0 0 0
5 3 8 0 3 17 0 0 0 6 0 0 0 0 0
6 0 8 0 7 16 0 0 0 6 0 0 0 0 0
IP24 2AD IP24 2JZ IP24 2LD IP24 2LW IP24 2PU IP24 2TD IP24 2TQ IP24 2YR IP24 2YW IP24 2ZA IP24 3EP IP24 3HG
1 14 5 0 0 0 0 0 0 10 0 12 0
2 8 0 0 0 0 0 0 0 11 0 14 10
3 2 5 0 0 0 0 0 0 11 0 0 8
4 0 0 0 0 0 0 0 0 0 0 0 7
5 0 0 0 0 0 0 0 0 0 0 0 4
6 0 0 0 0 0 0 0 10 0 0 0 9
I want an output showing something like:
df1
year week IP20 0HR IP20 9LR IP20 9PJ ...
2010 33 3 2 1 ...
2011 2 1 7 1 ...
2012 3 2 3 1 ...
2013 4 2 1 2 ...
.
.
2019 1 1 1 1 ...
&
df2
year week IP20 0HR IP20 9LR IP20 9PJ ...
2010 1 0 0 0 ...
2010 5 0 0 0 ...
2011 11 0 0 0 ...
.
.
.
2019 3 0 0 0 ...
I have tried:
pat <- data.frame(method_three[, 3:379] == 0)
which only returns TRUE or FALSE values rather than output like the above. I want the data frame to keep only the 0 values (no non-zero values), and the values greater than 0 to go in a separate data frame.
EXTRA:
Then separately embed the columns into a list so I could call each column individually to print out their values
I am trying to convert the raw data below to an adjacency matrix by assigning the value in the column "s_chloramphenicol" in preparation for a network analysis.
df <- structure(list(studyid0 = c(1L, 5L, 6L, 8L, 9L, 11L, 3052L, 3057L,
3058L, 3058L, 3060L, 3063L, 3064L, 3067L), s_chloramphenicol = c(0L,
0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L)), row.names = c(NA,
-14L), class = "data.frame", .Names = c("studyid0", "s_chloramphenicol"
))
The expected output is
df<-structure(list(`1` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `5` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `6` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `8` = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L,
0L, 0L, 0L), `9` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `11` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `3052` = c(0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L), `3057` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `3058` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `3060` = c(0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L), `3063` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `3064` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `3067` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L)), .Names = c("1", "5", "6", "8", "9", "11", "3052",
"3057", "3058", "3060", "3063", "3064", "3067"), class = "data.frame", row.names = c(1L,
5L, 6L, 8L, 9L, 11L, 3052L, 3057L, 3058L, 3060L, 3063L, 3064L,
3067L))
You can use the function outer:
# Outer product of the 0/1 indicator with itself: cell (i, j) is 1 only when
# both row i and row j of `df` have s_chloramphenicol == 1 (this includes the
# diagonal, i.e. i == j).
df2 <- outer(df$s_chloramphenicol, df$s_chloramphenicol)
# Label both dimensions with the study ids; ids that occur more than once in
# `df` (3058 here) therefore appear more than once in the matrix.
rownames(df2) <- colnames(df2) <- df$studyid0
df2
Output:
1 5 6 8 9 11 3052 3057 3058 3058 3060 3063 3064 3067
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0 0 0 0 0 0 0
8 0 0 0 1 0 0 1 0 0 0 1 0 0 0
9 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3052 0 0 0 1 0 0 1 0 0 0 1 0 0 0
3057 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3058 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3058 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3060 0 0 0 1 0 0 1 0 0 0 1 0 0 0
3063 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3064 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3067 0 0 0 0 0 0 0 0 0 0 0 0 0 0
I have a big dataset, on film ratings (1 - 10) and would like to get the distribution of the ratings. I also have 0s in the dataset, but those are in reality NAs, but I need them as 0s for later in the project (trying to build a recommendation system).
Sample Data
User.ID 60392452 60502258 60915544 60928336 60930535 60934417 60938455 60959037 60976845
1 26 0 0 0 0 0 0 0 0 0
2 51 0 0 0 0 0 0 0 0 0
3 91 0 0 0 0 0 0 0 0 0
4 99 0 0 0 0 0 0 0 0 0
5 114 0 0 0 0 0 0 0 0 0
6 125 0 0 0 0 0 0 0 0 0
7 165 0 0 0 0 0 0 0 0 9
8 243 0 0 10 0 0 0 0 0 0
Ok, it's not so readable, but User ID column is "26", "51" etc. The movies, which are indicated by codes, are "60392452" etc and are the column headers.
As a start, I used the following code:
table(mod_dataset)
but I got an error message:
Error in table(mod_dataset) :
attempt to make a table with >= 2^31 elements
What is the equivalent of table for "big data" ?
I am really not sure whether this answers your question, but it's a way to table the ratings on a column by column basis.
# Tabulate each movie column separately (User.ID, the first column, is
# dropped). Fixing the factor levels to 0:10 makes every per-column table the
# same length, so sapply() can bind them into one matrix: one row per rating,
# one column per movie.
res <- sapply(mod_dataset[-1], function(x) table(factor(x, levels = 0:10)))
# Flag rating levels whose count is 0 in every movie column ...
inx <- apply(res, 1, function(x) all(x == 0))
# ... and display only the rating levels that occur at least once.
res[!inx, ]
Data in dput format.
mod_dataset <-
structure(list(User.ID = c(26L, 51L, 91L, 99L, 114L, 125L, 165L,
243L), X60392452 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X60502258 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), X60915544 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 10L), X60928336 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
X60930535 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X60934417 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), X60938455 = c(0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), X60959037 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), X60976845 = c(0L, 0L, 0L, 0L, 0L, 0L, 9L, 0L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8"))
In a data frame, after some calculations, all rows end with a series of 0, as in the (partial) example below:
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
1 -9 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 4 -1 1 -1 0 -1 0 0 0 0 0 0 0 0 0
3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5 -3 0 0 0 0 0 0 0 0 0 0 0 0 0 0
6 -6 0 0 0 0 0 0 0 0 0 0 0 0 0 0
7 4 -4 1 -1 0 -1 0 0 0 0 0 0 0 0 0
8 3 -3 0 0 0 0 0 0 0 0 0 0 0 0 0
9 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11 -3 0 0 0 0 0 0 0 0 0 0 0 0 0 0
But:
- some isolated 0s can occur before the start of the series of 0s, as in lines 2 and 7
- some lines are entirely made of 0, as in lines 4 and 10
I would like to create a new column containing the following information:
"in which column does the series of 0 start?"
From the above example, this new column should contain the numbers:
2, 7, 2, 1, 2, 2, 7, 3, 2, 1, 2
I can't figure out how to do this...
Thanks for any hint.
Use apply to run rle on each row and get the first index where the value is equal to zero and the length is greater than 1 (start of series).
# For each row, return the column index at which the first run of two or more
# consecutive zeros begins (NA if the row has no such run).
# rle() is computed once per row. which() over the runs yields a *run* index,
# which only equals the column index when every earlier run has length 1, so
# it is mapped back to a column position via the cumulative run lengths.
apply(df, 1, function(x) {
  runs <- rle(x)
  # starting column of each run: 1, 1 + len1, 1 + len1 + len2, ...
  run_start <- cumsum(c(1L, runs$lengths[-length(runs$lengths)]))
  run_start[which(runs$values == 0 & runs$lengths > 1)[1]]
})
# [1] 2 7 2 1 2 2 7 3 2 1 2
Data
df = structure(list(X1 = c(-9L, 4L, 3L, 0L, -3L, -6L, 4L, 3L, 3L,
0L, -3L), X2 = c(0L, -1L, 0L, 0L, 0L, 0L, -4L, -3L, 0L, 0L, 0L
), X3 = c(0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), X4 = c(0L,
-1L, 0L, 0L, 0L, 0L, -1L, 0L, 0L, 0L, 0L), X5 = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X6 = c(0L, -1L, 0L, 0L, 0L,
0L, -1L, 0L, 0L, 0L, 0L), X7 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), X8 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), X9 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X10 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X11 = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X12 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L), X13 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), X14 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), X15 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("X1",
"X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11",
"X12", "X13", "X14", "X15"), class = "data.frame", row.names = c(NA,
-11L))
Here is an easy solution. There are probably more sophisticated ones, but it works. Assuming your matrix is called 'x':
# Append a new column holding, for each row, the index of the first column
# whose value is exactly zero (NA when the row contains no zero).
#
# The comparison is numeric (== 0) instead of grep(0, ...): grep() does a
# regex match on the printed value, so entries such as 10 or -20 would also
# count as "zero".
# NOTE: assigning to column ncol(x) + 1 works for a data frame; for a plain
# matrix use cbind() instead.
n_orig <- ncol(x)  # captured before the result column is added
# Only the original columns are searched, so the result column can never
# report its own index.
is_zero <- as.matrix(x[, seq_len(n_orig), drop = FALSE]) == 0
x[, n_orig + 1] <- unname(apply(is_zero, 1, function(z) which(z)[1]))