table plot in R adjusting the axis - r

I am trying to plot this table
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
X0 4 0 0 0 0 0 0 0 0 0 0 163 0 0 78 0 0 0 1 0 0 0 0 0 1 0 0 153 0
X1 0 0 0 0 0 152 123 0 0 0 0 0 5 0 1 0 0 0 0 119 0 0 0 0 0 0 0 0 0
X2 0 0 55 0 0 1 0 0 185 0 0 0 0 0 0 0 3 0 0 0 2 0 0 0 0 154 0 0 0
X3 1 1 0 0 149 0 0 0 0 0 1 0 0 0 0 4 0 4 126 0 0 0 0 0 108 1 5 0 0
X4 0 0 0 16 0 1 0 108 0 110 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
X5 13 0 0 0 3 1 0 0 0 0 0 0 1 2 0 138 0 123 7 0 0 0 1 0 18 0 93 0 0
X6 0 0 0 0 0 3 0 0 0 0 0 0 0 85 1 0 0 0 1 0 0 104 100 104 0 0 2 0 0
X7 0 93 23 0 0 0 0 0 0 0 71 0 0 0 0 0 55 0 0 0 55 0 0 0 0 0 0 0 103
X8 245 0 0 0 0 0 0 0 0 0 0 0 0 0 73 7 0 12 1 0 0 0 1 1 4 0 48 8 0
X9 0 0 0 153 0 11 1 15 0 18 0 1 194 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0
30
X0 0
X1 0
X2 0
X3 0
X4 162
X5 0
X6 0
X7 0
X8 0
X9 5
but cannot make it visually clear what the table is showing, as the y-axis gets smudged due to the number of columns in the table.
Is it possible to plot this in a way that the axes are clear and nothing gets smudged together?
> dput(tablen)
structure(c(4L, 0L, 0L, 1L, 0L, 13L, 0L, 0L, 245L, 0L, 0L, 0L,
0L, 1L, 0L, 0L, 0L, 93L, 0L, 0L, 0L, 0L, 55L, 0L, 0L, 0L, 0L,
23L, 0L, 0L, 0L, 0L, 0L, 0L, 16L, 0L, 0L, 0L, 0L, 153L, 0L, 0L,
0L, 149L, 0L, 3L, 0L, 0L, 0L, 0L, 0L, 152L, 1L, 0L, 1L, 1L, 3L,
0L, 0L, 11L, 0L, 123L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 108L, 0L, 0L, 0L, 0L, 15L, 0L, 0L, 185L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 110L, 0L, 0L, 0L, 0L, 18L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 71L, 0L, 0L, 163L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 5L, 0L, 0L, 3L, 1L, 0L, 0L, 0L, 194L, 0L,
0L, 0L, 0L, 0L, 2L, 85L, 0L, 0L, 0L, 78L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 73L, 0L, 0L, 0L, 0L, 4L, 0L, 138L, 0L, 0L, 7L, 0L, 0L,
0L, 3L, 0L, 0L, 0L, 0L, 55L, 0L, 0L, 0L, 0L, 0L, 4L, 0L, 123L,
0L, 0L, 12L, 0L, 1L, 0L, 0L, 126L, 0L, 7L, 1L, 0L, 1L, 0L, 0L,
119L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 2L, 0L, 0L, 0L,
0L, 55L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 104L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 100L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
104L, 0L, 1L, 0L, 1L, 0L, 0L, 108L, 0L, 18L, 0L, 0L, 4L, 0L,
0L, 0L, 154L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 5L, 0L,
93L, 2L, 0L, 48L, 0L, 153L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 8L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 103L, 0L, 0L, 0L, 0L, 0L, 0L, 162L,
0L, 0L, 0L, 0L, 5L), .Dim = c(10L, 30L), .Dimnames = structure(list(
c("X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9"
), c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11",
"12", "13", "14", "15", "16", "17", "18", "19", "20", "21",
"22", "23", "24", "25", "26", "27", "28", "29", "30")), .Names = c("",
"")), class = "table")

You could rotate the y-axis labels (see help("par") for documentation):
plot(tablen, las = 1)

Related

in R, how can I find the row number of the first occurrence and last occurrence of a value in a Matrix?

In R, I've created 25x25 matrices of values of 1 and 0 and I need to find the height between the first occurrence of 1 in the matrix and the last occurrence of 1 in the matrix.
Here's an example of a matrix of the letter a, where each 1 represents a black pixel and each 0 represents a white pixel:
a <- read.csv(csv_files[1])
a
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
7 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
8 0 0 0 0 0 1 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0
9 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
10 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
11 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
12 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
13 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
14 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
15 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
16 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
17 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0
18 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0
19 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0
20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
21 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
22 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
23 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
24 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
My idea is to find the row number of the last occurrence of 1 and the row number of the first occurrence of 1 and subtract one from the other, which will give me the height of the symbol.
In this case it would be 19 - 6 = 13, so the height is 13.
For context, I drew images of different letters and symbols in GIMP, then imported them into R and saved them in a matrix as a CSV file.
Try the code below
> diff(range(which(a == 1, arr.ind = TRUE)[, "row"]))
[1] 13
Data
> dput(a)
structure(list(V1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
V2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), V3 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), V4 = c(0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L), V5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), V6 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L
), V7 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), V8 = c(0L,
0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), V9 = c(0L, 0L, 0L, 0L,
0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L), V10 = c(0L, 0L, 0L, 0L, 0L, 1L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 0L), V11 = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), V12 = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L
), V13 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), V14 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), V15 = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), V16 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L), V17 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L), V18 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), V19 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
V20 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), V21 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), V22 = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), V23 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), V24 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), V25 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L)), class = "data.frame", row.names = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20", "21", "22", "23", "24", "25"))

In R, taking differences between values in two rows based on values in other rows, with specific requirements

I have a bunch of rows (1000s) of data with multiple subjects that I have a pretty basic question for, but am very unsure how to go about answering it. Below is an example of what my data frame looks like.
Subject Time Current.State Transition.State Transition.Event L2 L1 C R2 R1 FoodCup
14 KM6 435 16 15 0 0 0 0 0 0 0
15 KM6 455 15 4 0 0 0 0 0 0 0
16 KM6 648 4 7 3 0 0 1 0 0 0
17 KM6 658 7 14 0 0 0 0 0 0 0
18 KM6 691 14 16 8 0 0 0 0 0 1
19 KM6 698 16 0 0 0 0 0 0 0 0
20 KM6 721 16 15 0 0 0 0 0 0 0
21 KM6 741 15 4 0 0 0 0 0 0 0
22 KM6 758 4 0 0 0 0 0 0 0 1
23 KM6 762 4 0 0 0 0 0 0 0 0
24 KM6 810 4 7 6 0 0 0 0 1 0
25 KM6 814 7 0 0 0 0 0 0 0 0
26 KM6 815 7 0 0 0 0 1 0 0 0
27 KM6 819 7 0 0 0 0 1 0 0 0
28 KM6 820 7 14 0 0 0 0 0 0 0
29 KM6 821 14 0 0 0 0 0 0 0 0
30 KM6 822 14 0 0 0 0 1 0 0 0
31 KM6 824 14 0 0 0 0 0 0 0 0
32 KM6 829 14 0 0 0 0 1 0 0 0
33 KM6 862 14 16 8 0 0 0 0 0 1
34 KM6 863 16 0 0 0 0 0 0 0 1
The task I'm looking to complete is:
How much Time passes between a Transition.State == 7 and a FoodCup == 1, but only for the first FoodCup == 1 since the last Transition.State == 7.
For example, I want to be able to take the time in row 16 (648) because that row has a Transition.State == 7, then take the time (691) from row 18 because it is the first FoodCup == 1 after row 16, and then get a value of that time difference (691-648). But I want to completely ignore row 22, even though it has a FoodCup == 1, since there was no Transition.State == 7 "directly" above it.
The desired output would be a new data.frame with every occurrence of this with its time difference per Subject (not shown here, but there are multiple in the actual dfs)
I hope this makes sense.
Thank you!
Maybe something like the following will do what you want.
# Time elapsed between a `want` transition and the first FoodCup == 1 row
# that follows it.
#
# Arguments:
#   DF          data frame with columns Time, Transition.State and FoodCup.
#               Rows are assumed to be in chronological order; the function
#               does not sort them.
#   want        Transition.State value that starts the clock (default 7).
#   first_only  TRUE (default) returns only the first matching difference;
#               FALSE returns one difference per `want` transition that is
#               followed by a FoodCup == 1 row before the next `want`
#               transition.
#
# Returns a vector of Time differences. When nothing matches, the result is
# NA for first_only = TRUE and a zero-length vector for first_only = FALSE.
fun <- function(DF, want = 7, first_only = TRUE) {
  times <- DF[["Time"]]
  st <- which(DF[["Transition.State"]] == want)
  fc <- which(DF[["FoodCup"]] == 1)

  # For every FoodCup row: index (into `st`) of the most recent `want`
  # transition at or before it; 0 means no such transition has happened yet.
  last_st <- findInterval(fc, st)

  # Keep only the first FoodCup row seen after each transition and drop the
  # FoodCup rows that precede every transition. `last_st` indexes `st`, while
  # `keep` indexes `fc`; the two must not be mixed up.
  keep <- last_st > 0 & !duplicated(last_st)
  diffs <- times[fc[keep]] - times[st[last_st[keep]]]

  if (first_only) diffs[1] else diffs
}
fun(df1)
#[1] 43
fun(df1, 14)
#[1] 33
Data in dput format.
df1 <-
structure(list(Subject = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
.Label = "KM6", class = "factor"), Time = c(435L, 455L, 648L,
658L, 691L, 698L, 721L, 741L,
758L, 762L, 810L, 814L, 815L, 819L, 820L, 821L, 822L, 824L,
829L, 862L, 863L), Current.State = c(16L, 15L, 4L, 7L, 14L,
16L, 16L, 15L, 4L, 4L, 4L, 7L, 7L, 7L, 7L, 14L, 14L, 14L,
14L, 14L, 16L), Transition.State = c(15L, 4L, 7L, 14L, 16L,
0L, 15L, 4L, 0L, 0L, 7L, 0L, 0L, 0L, 14L, 0L, 0L, 0L, 0L,
16L, 0L), Transition.Event = c(0L, 0L, 3L, 0L, 8L, 0L, 0L,
0L, 0L, 0L, 6L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 8L, 0L),
L2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), L1 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), C = c(0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L), R2 = c(0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), R1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), FoodCup = c(0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 1L)), class = "data.frame", row.names = c("14",
"15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25",
"26", "27", "28", "29", "30", "31", "32", "33", "34"))

how to prepare an adjacency matrix for network analysis

I am trying to convert the raw data below to an adjacency matrix by assigning the value in the column "s_chloramphenicol", in preparation for a network analysis.
df <- structure(list(studyid0 = c(1L, 5L, 6L, 8L, 9L, 11L, 3052L, 3057L,
3058L, 3058L, 3060L, 3063L, 3064L, 3067L), s_chloramphenicol = c(0L,
0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L)), row.names = c(NA,
-14L), class = "data.frame", .Names = c("studyid0", "s_chloramphenicol"
))
The expected output is
df<-structure(list(`1` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `5` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `6` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `8` = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L,
0L, 0L, 0L), `9` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `11` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `3052` = c(0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L), `3057` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `3058` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `3060` = c(0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L), `3063` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `3064` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `3067` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L)), .Names = c("1", "5", "6", "8", "9", "11", "3052",
"3057", "3058", "3060", "3063", "3064", "3067"), class = "data.frame", row.names = c(1L,
5L, 6L, 8L, 9L, 11L, 3052L, 3057L, 3058L, 3060L, 3063L, 3064L,
3067L))
You can use the function outer:
df2 <- outer(df$s_chloramphenicol, df$s_chloramphenicol)
rownames(df2) <- colnames(df2) <- df$studyid0
df2
Output:
1 5 6 8 9 11 3052 3057 3058 3058 3060 3063 3064 3067
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0 0 0 0 0 0 0
8 0 0 0 1 0 0 1 0 0 0 1 0 0 0
9 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3052 0 0 0 1 0 0 1 0 0 0 1 0 0 0
3057 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3058 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3058 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3060 0 0 0 1 0 0 1 0 0 0 1 0 0 0
3063 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3064 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3067 0 0 0 0 0 0 0 0 0 0 0 0 0 0

Calculating frequency of a number in dataframe

I have a big dataset, on film ratings (1 - 10) and would like to get the distribution of the ratings. I also have 0s in the dataset, but those are in reality NAs, but I need them as 0s for later in the project (trying to build a recommendation system).
Sample Data
User.ID 60392452 60502258 60915544 60928336 60930535 60934417 60938455 60959037 60976845
1 26 0 0 0 0 0 0 0 0 0
2 51 0 0 0 0 0 0 0 0 0
3 91 0 0 0 0 0 0 0 0 0
4 99 0 0 0 0 0 0 0 0 0
5 114 0 0 0 0 0 0 0 0 0
6 125 0 0 0 0 0 0 0 0 0
7 165 0 0 0 0 0 0 0 0 9
8 243 0 0 10 0 0 0 0 0 0
Ok, it's not so readable, but User ID column is "26", "51" etc. The movies, which are indicated by codes, are "60392452" etc and are the column headers.
As a start, I used the following code:
table(mod_dataset)
but I got an error message:
Error in table(mod_dataset) :
attempt to make a table with >= 2^31 elements
What is the equivalent of table for "big data" ?
I am really not sure whether this answers your question, but it's a way to table the ratings on a column by column basis.
# Count how often each rating 0..10 occurs in every movie column (the first
# column, User.ID, is skipped). vapply() fixes the result to an 11-row integer
# matrix regardless of input, with rating levels as row names.
res <- vapply(
  mod_dataset[-1],
  function(x) {
    counts <- table(factor(x, levels = 0:10))
    setNames(as.integer(counts), names(counts))
  },
  integer(11)
)
# Ratings that never occur in any movie column.
inx <- apply(res, 1, function(x) all(x == 0))
# drop = FALSE keeps a matrix even when only one rating level remains.
res[!inx, , drop = FALSE]
Data in dput format.
mod_dataset <-
structure(list(User.ID = c(26L, 51L, 91L, 99L, 114L, 125L, 165L,
243L), X60392452 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X60502258 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), X60915544 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 10L), X60928336 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
X60930535 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X60934417 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), X60938455 = c(0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), X60959037 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), X60976845 = c(0L, 0L, 0L, 0L, 0L, 0L, 9L, 0L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8"))

Find first column with specific property

In a data frame, after some calculations, all rows end with a series of 0, as in the (partial) example below:
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
1 -9 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 4 -1 1 -1 0 -1 0 0 0 0 0 0 0 0 0
3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5 -3 0 0 0 0 0 0 0 0 0 0 0 0 0 0
6 -6 0 0 0 0 0 0 0 0 0 0 0 0 0 0
7 4 -4 1 -1 0 -1 0 0 0 0 0 0 0 0 0
8 3 -3 0 0 0 0 0 0 0 0 0 0 0 0 0
9 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11 -3 0 0 0 0 0 0 0 0 0 0 0 0 0 0
But:
- some isolated 0 can occur before the starting of the series of 0, as in lines 2 and 7
- some lines are entirely made of 0, as in lines 4 and 10
I would like to create a new column containing the following information:
"in which column does the series of 0 start?"
From the above example, this new column should contain the numbers:
2, 7, 2, 1, 2, 2, 7, 3, 2, 1, 2
I can't figure out how to do this...
Thanks for any hint.
Use apply to run rle on each row and get the first index where the value is equal to zero and the length is greater than 1 (start of series).
# For each row, return the column where the first run of two or more
# consecutive zeros begins (NA if the row has no such run).
apply(df, 1, function(x) {
  runs <- rle(x)
  # Column at which each run starts: 1, then 1 + cumulative earlier lengths.
  starts <- cumsum(c(1L, head(runs$lengths, -1L)))
  # which() yields a run index; map it to a column via `starts`, otherwise
  # the answer is only right when every earlier run has length 1.
  starts[which(runs$values == 0 & runs$lengths > 1)[1]]
})
# [1] 2 7 2 1 2 2 7 3 2 1 2
Data
df = structure(list(X1 = c(-9L, 4L, 3L, 0L, -3L, -6L, 4L, 3L, 3L,
0L, -3L), X2 = c(0L, -1L, 0L, 0L, 0L, 0L, -4L, -3L, 0L, 0L, 0L
), X3 = c(0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), X4 = c(0L,
-1L, 0L, 0L, 0L, 0L, -1L, 0L, 0L, 0L, 0L), X5 = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X6 = c(0L, -1L, 0L, 0L, 0L,
0L, -1L, 0L, 0L, 0L, 0L), X7 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), X8 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), X9 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X10 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X11 = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X12 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L), X13 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), X14 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), X15 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("X1",
"X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11",
"X12", "X13", "X14", "X15"), class = "data.frame", row.names = c(NA,
-11L))
Here is an easy solution. There are probably more sophisticated ones, but it works. Assuming your matrix is called 'x':
# Make a new column and fill it with zeros; remember how many data columns
# there were so the new column is never scanned itself.
n_data <- ncol(x)
x[, n_data + 1] <- 0
# Loop through the rows and note, in the new column, the column where the
# trailing series of zeros starts: one past the last non-zero value (1 for an
# all-zero row, n_data + 1 if the row does not end in zero).
# Numeric comparison is used instead of grep(0, ...), which would also match
# values such as 10 or -10 and would stop at an isolated zero.
for (i in seq_len(nrow(x))) {
  vals <- unlist(x[i, seq_len(n_data)])
  non_zero <- which(vals != 0)
  x[i, n_data + 1] <- if (length(non_zero) > 0) max(non_zero) + 1 else 1
}

Resources