I have a table that looks like this:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
586 0 0 0 1 0 0 0 1 3 1 0 1 0 0 0 0 0 1 0 2 0 3 0 0 0 4 0 1 2 0
637 0 0 0 0 0 0 2 3 2 2 0 4 0 0 0 0 1 0 0 2 0 1 1 1 0 0 0 0 0 1
989 0 0 1 0 0 0 2 1 0 0 0 2 1 0 0 1 2 1 0 3 0 2 0 1 1 0 1 0 1 0
1081 0 0 0 1 0 0 1 0 1 1 0 0 2 0 0 0 0 0 0 3 0 5 0 0 2 1 0 1 1 1
2922 0 1 1 1 0 0 0 2 1 0 0 0 2 0 0 0 1 1 0 1 0 3 1 1 2 0 0 1 0 1
3032 0 1 0 0 0 0 0 3 0 0 1 0 2 1 0 1 0 1 1 0 0 3 1 1 1 1 0 0 1 1
Numbers 1 to 30 in the first row are my labels, and the columns are my items. I would like to find, for each item, the label with the most counts. E.g. 586 has 4 counts of 26, which is the highest number in that row, so for 586, I would like to assign 26.
I am able to get the maximum value for a row with max(table1[1,]), which gets me the maximum value for the first row but not the label it corresponds to, and I don't know how to proceed. All help is appreciated!
dput:
structure(c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L,
0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 2L, 1L, 0L, 0L, 1L, 3L, 1L,
0L, 2L, 3L, 3L, 2L, 0L, 1L, 1L, 0L, 1L, 2L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 4L, 2L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 2L,
2L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 1L, 0L, 1L, 2L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 1L, 2L, 2L, 3L, 3L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 3L, 1L, 2L, 5L, 3L, 3L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L,
0L, 1L, 1L, 0L, 0L, 1L, 2L, 2L, 1L, 4L, 0L, 0L, 1L, 0L, 1L, 0L,
0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 2L, 0L, 1L, 1L, 0L,
1L, 0L, 1L, 0L, 1L, 1L, 1L), .Dim = c(6L, 30L), .Dimnames = structure(list(
c("586", "637", "989", "1081", "2922", "3032"), c("1", "2",
"3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23",
"24", "25", "26", "27", "28", "29", "30")), .Names = c("",
"")))
max.col will give you a vector of column numbers which correspond to the maximum value for each row.
> max.col(df, ties.method = "first")  # ties go to the first maximal column
[1] 26 12 20 22 22 8
You can use that vector to get column names for each row.
> colnames(df)[max.col(df, ties.method = "first")]  # map column positions to labels
[1] "26" "12" "20" "22" "22" "8"
Perhaps you are looking for which.max. Assuming your matrix is called "temp":
> apply(temp, 1, which.max)
586 637 989 1081 2922 3032
26 12 20 22 22 8
apply with MARGIN = 1 (the second argument) will apply a function by row.
Related
I have a matrix like the one below that is a hypergraph matrix. I transformed it into a network object, but I don't know how to transform this matrix into a hypergraph that is an object of class network. Can you help me? Any ideas?
mat<-as.matrix(data)
g<- as.network.matrix(mat)
g
E1 E2 E3 E4 E5 E6 E7 E8 E9 E10 E11 E12 E13 E14
EVELYN 1 1 1 1 1 1 0 1 1 0 0 0 0 0
LAURA 1 1 1 0 1 1 1 1 0 0 0 0 0 0
THERESA 0 1 1 1 1 1 1 1 1 0 0 0 0 0
BRENDA 1 0 1 1 1 1 1 1 0 0 0 0 0 0
CHARLOTTE 0 0 1 1 1 0 1 0 0 0 0 0 0 0
FRANCES 0 0 1 0 1 1 0 1 0 0 0 0 0 0
ELEANOR 0 0 0 0 1 1 1 1 0 0 0 0 0 0
PEARL 0 0 0 0 0 1 0 1 1 0 0 0 0 0
RUTH 0 0 0 0 1 0 1 1 1 0 0 0 0 0
VERNE 0 0 0 0 0 0 1 1 1 0 0 1 0 0
MYRA 0 0 0 0 0 0 0 1 1 1 0 1 0 0
KATHERINE 0 0 0 0 0 0 0 1 1 1 0 1 1 1
SYLVIA 0 0 0 0 0 0 1 1 1 1 0 1 1 1
NORA 0 0 0 0 0 1 1 0 1 1 1 1 1 1
HELEN 0 0 0 0 0 0 1 1 0 1 1 1 0 0
DOROTHY 0 0 0 0 0 0 0 1 1 0 0 0 0 0
OLIVIA 0 0 0 0 0 0 0 0 1 0 1 0 0 0
FLORA 0 0 0 0 0 0 0 0 1 0 1 0 0 0
I guess mat is an incidence matrix, and I am not sure whether you are looking for something like the following, if you are using the network package:
as.matrix(network(t(mat)),matrix.type = "incidence")
Besides, the incidence matrix visualization via igraph can be achieved from the following:
g <- igraph::graph_from_incidence_matrix(mat)
then
plot(g)
gives
DATA
mat <- structure(c(1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L,
1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L), .Dim = c(18L,
14L), .Dimnames = list(c("EVELYN", "LAURA", "THERESA", "BRENDA",
"CHARLOTTE", "FRANCES", "ELEANOR", "PEARL", "RUTH", "VERNE",
"MYRA", "KATHERINE", "SYLVIA", "NORA", "HELEN", "DOROTHY", "OLIVIA",
"FLORA"), c("E1", "E2", "E3", "E4", "E5", "E6", "E7", "E8", "E9",
"E10", "E11", "E12", "E13", "E14")))
I want to create conditional lead/lag variables that capture the pre and post years of an agreement signed by countries.
More precisely, I want to create the following variables:
a variable that is =1 in the 4 years pre/before the agreement, 0 otherwise
a variable that is =1 5 years pre the agreement and
a variable that is =1 only after 4 years after the ratification
I have country-year data (please see below for a sample of the data). X1 indicates whether a country has signed the agreement (=1) or not (=0).
The variables I want to create (my expected output) are manually created in the sample data below, labeled as X1_pre4, X1_pre5 and X1_post4. The first captures the 4 years (or up to 4 years) before the agreement is signed. The second captures the 5 years before the agreement is signed. And the last variable captures the 5 years after the agreement is signed (it starts the same year as the agreement is signed, but it's fine if it starts after that, too).
I have been advised to use some sort of "split-operate-unsplit" construct. But I personally think that this can be done in dplyr, using the mutate command.
Currently, I've been trying to work with this logic:
data$X1_pre4[data$year<="1972" & data$X1=="0" ] <- "1"
But this is not good enough (far away), as I am not sure how to group by country here. Even if I figure this out, it won't do the work as I have over 100 of X's (agreements). I simply need a code that is much smarter.
data <-
structure(list(country = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"),
year = c(1970L, 1971L, 1972L, 1973L, 1974L, 1975L, 1976L,
1977L, 1978L, 1979L, 1980L, 1981L, 1982L, 1983L, 1984L, 1985L,
1986L, 1987L, 1988L, 1970L, 1971L, 1972L, 1973L, 1974L, 1975L,
1976L, 1977L, 1978L, 1979L, 1980L, 1981L, 1982L, 1983L, 1984L,
1985L, 1986L, 1987L, 1988L, 1970L, 1971L, 1972L, 1973L, 1974L,
1975L, 1976L, 1977L, 1978L, 1979L, 1980L, 1981L, 1982L, 1983L,
1984L, 1985L, 1986L, 1987L, 1988L, 1989L, 1990L, 1991L),
X1 = c(0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), X1_pre4 = c(1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L), X1_pre5 = c(1L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X1_post4 = c(0L,
0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-60L))
This would correspond to your logic:
# Flag the rows that fall within `k` rows *before* an event.
#
# idx: vector in which 1 marks an event row (anything else, including NA,
#      is treated as "no event").
# k:   window size, in rows.
# Returns a 0/1 vector of the same length as `idx`: 1 where an event occurs
# within the next `k` rows, 0 elsewhere. Event rows themselves are set to 0.
check_pre <- function(idx, k) {
  n <- length(idx)
  # seq_along() keeps empty input empty (1:length(idx) would iterate over 1, 0).
  pre_vec <- vapply(
    seq_along(idx),
    # Look at the current row and the k rows after it, clipped to the end.
    function(x) +any(idx[x:min(x + k, n)] %in% 1),
    integer(1)
  )
  # The event row itself is not part of the "pre" window.
  pre_vec[idx %in% 1] <- 0
  pre_vec
}
# Flag the rows that fall within `k` rows *after* an event (event row included).
#
# idx: vector in which 1 marks an event row (anything else, including NA,
#      is treated as "no event").
# k:   window size, in rows.
# Returns an integer 0/1 vector of the same length as `idx`: 1 on the event
# row and the `k` rows that follow it, 0 elsewhere.
check_post <- function(idx, k) {
  # seq_along() keeps empty input empty (1:length(idx) would iterate over 1, 0).
  vapply(
    seq_along(idx),
    # Look at the current row and the k rows before it, clipped to the start.
    function(x) +any(idx[max(x - k, 1):x] %in% 1),
    integer(1)
  )
}
# Requires dplyr (%>%, group_by, mutate, lag, row_number, ungroup) and the
# check_pre()/check_post() helpers.
df %>%
  group_by(country) %>%
  mutate(
    # idx marks the row where the agreement starts within each country:
    # either X1 switches from 0 to 1, or the country's series already starts
    # at 1 (there is no previous row to compare against on the first row).
    idx = +((lag(X1) == 0 & X1 == 1) | (row_number() == 1 & X1 == 1)),
    X1_pre4 = check_pre(idx, 4),
    X1_pre5 = check_pre(idx, 5),
    X1_post4 = check_post(idx, 4),
    idx = NULL # drop the helper column again
  ) %>%
  # Do not leave the result grouped; later verbs would silently run per country.
  ungroup()
Basically we create an index of when the agreement occurred, and then check for the rows before/after this index with custom functions check_pre and check_post.
This is the output:
country year X1 X1_pre4 X1_pre5 X1_post4
1 A 1970 0 1 1 0
2 A 1971 0 1 1 0
3 A 1972 1 0 0 1
4 A 1973 1 0 0 1
5 A 1974 1 0 0 1
6 A 1975 1 0 0 1
7 A 1976 1 0 0 1
8 A 1977 1 0 0 0
9 A 1978 1 0 0 0
10 A 1979 1 0 0 0
11 A 1980 1 0 0 0
12 A 1981 1 0 0 0
13 A 1982 1 0 0 0
14 A 1983 1 0 0 0
15 A 1984 1 0 0 0
16 A 1985 1 0 0 0
17 A 1986 1 0 0 0
18 A 1987 1 0 0 0
19 A 1988 1 0 0 0
20 B 1970 0 0 0 0
21 B 1971 0 0 0 0
22 B 1972 0 0 0 0
23 B 1973 0 0 1 0
24 B 1974 0 1 1 0
25 B 1975 0 1 1 0
26 B 1976 0 1 1 0
27 B 1977 0 1 1 0
28 B 1978 1 0 0 1
29 B 1979 1 0 0 1
30 B 1980 1 0 0 1
31 B 1981 1 0 0 1
32 B 1982 1 0 0 1
33 B 1983 1 0 0 0
34 B 1984 1 0 0 0
35 B 1985 1 0 0 0
36 B 1986 1 0 0 0
37 B 1987 1 0 0 0
38 B 1988 1 0 0 0
39 C 1970 1 0 0 1
40 C 1971 0 0 0 1
41 C 1972 0 0 0 1
42 C 1973 0 0 0 1
43 C 1974 0 0 0 1
44 C 1975 0 0 0 0
45 C 1976 0 0 0 0
46 C 1977 0 0 0 0
47 C 1978 0 0 0 0
48 C 1979 0 0 1 0
49 C 1980 0 1 1 0
50 C 1981 0 1 1 0
51 C 1982 0 1 1 0
52 C 1983 0 1 1 0
53 C 1984 1 0 0 1
54 C 1985 1 0 0 1
55 C 1986 1 0 0 1
56 C 1987 1 0 0 1
57 C 1988 1 0 0 1
58 C 1989 1 0 0 0
59 C 1990 1 0 0 0
60 C 1991 1 0 0 0
It corresponds to your desired output in majority of cases, however from row 39 onwards you don't have it marked as post-agreement - though it occurred in 1970. Either a typo or you'll need to further explain the logic.
I have a bunch of rows (1000s) of data with multiple subjects that I have a pretty basic question for, but am very unsure how to go about answering it. Below is an example of what my data frame looks like.
Subject Time Current.State Transition.State Transition.Event L2 L1 C R2 R1 FoodCup
14 KM6 435 16 15 0 0 0 0 0 0 0
15 KM6 455 15 4 0 0 0 0 0 0 0
16 KM6 648 4 7 3 0 0 1 0 0 0
17 KM6 658 7 14 0 0 0 0 0 0 0
18 KM6 691 14 16 8 0 0 0 0 0 1
19 KM6 698 16 0 0 0 0 0 0 0 0
20 KM6 721 16 15 0 0 0 0 0 0 0
21 KM6 741 15 4 0 0 0 0 0 0 0
22 KM6 758 4 0 0 0 0 0 0 0 1
23 KM6 762 4 0 0 0 0 0 0 0 0
24 KM6 810 4 7 6 0 0 0 0 1 0
25 KM6 814 7 0 0 0 0 0 0 0 0
26 KM6 815 7 0 0 0 0 1 0 0 0
27 KM6 819 7 0 0 0 0 1 0 0 0
28 KM6 820 7 14 0 0 0 0 0 0 0
29 KM6 821 14 0 0 0 0 0 0 0 0
30 KM6 822 14 0 0 0 0 1 0 0 0
31 KM6 824 14 0 0 0 0 0 0 0 0
32 KM6 829 14 0 0 0 0 1 0 0 0
33 KM6 862 14 16 8 0 0 0 0 0 1
34 KM6 863 16 0 0 0 0 0 0 0 1
The task I'm looking to complete is:
How much Time passes between a Transition.State == 7 and a FoodCup == 1, but only for the first FoodCup == 1 since the last Transition.State == 7.
For example, I want to be able to take the time in row 16 (648) because that row has a Transition.State == 7, then take the time (691) from row 18 because it is the first FoodCup == 1 after row 16, and then get a value of that time difference (691-648). But I want to completely ignore row 22, even though it has a FoodCup == 1, since there was no Transition.State == 7 "directly" above it.
The desired output would be a new data.frame with every occurrence of this with its time difference per Subject (not shown here, but there are multiple in the actual dfs)
I hope this makes sense.
Thank you!
Maybe something like the following will do what you want.
# Time elapsed between a transition into state `want` and the first
# FoodCup == 1 that follows it (before the next such transition).
#
# DF:          data frame with columns Transition.State, FoodCup and Time,
#              rows in chronological order.
# want:        the Transition.State value that starts the clock.
# all_matches: FALSE (default) returns only the first such time difference
#              (NA if there is none); TRUE returns one difference per
#              transition that is followed by a FoodCup, in row order.
fun <- function(DF, want = 7, all_matches = FALSE) {
  st <- which(DF[["Transition.State"]] == want)
  fc <- which(DF[["FoodCup"]] == 1)
  times <- DF[["Time"]]

  # For every FoodCup row: position *within st* of the latest transition at or
  # before it, or 0 when no transition has happened yet.
  last_st <- findInterval(fc, st)

  # Keep a FoodCup only if a transition precedes it and it is the first
  # FoodCup since that transition.
  keep <- last_st > 0L & !duplicated(last_st)

  if (all_matches) {
    return(times[fc[keep]] - times[st[last_st[keep]]])
  }

  # which(...)[1L] is NA when nothing qualifies, which propagates to an NA result.
  k <- which(keep)[1L]
  # last_st indexes st, k indexes fc -- the two must not be mixed up.
  times[fc[k]] - times[st[last_st[k]]]
}
fun(df1)
#[1] 43
fun(df1, 14)
#[1] 33
Data in dput format.
df1 <-
structure(list(Subject = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
.Label = "KM6", class = "factor"), Time = c(435L, 455L, 648L,
658L, 691L, 698L, 721L, 741L,
758L, 762L, 810L, 814L, 815L, 819L, 820L, 821L, 822L, 824L,
829L, 862L, 863L), Current.State = c(16L, 15L, 4L, 7L, 14L,
16L, 16L, 15L, 4L, 4L, 4L, 7L, 7L, 7L, 7L, 14L, 14L, 14L,
14L, 14L, 16L), Transition.State = c(15L, 4L, 7L, 14L, 16L,
0L, 15L, 4L, 0L, 0L, 7L, 0L, 0L, 0L, 14L, 0L, 0L, 0L, 0L,
16L, 0L), Transition.Event = c(0L, 0L, 3L, 0L, 8L, 0L, 0L,
0L, 0L, 0L, 6L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 8L, 0L),
L2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), L1 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), C = c(0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L), R2 = c(0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), R1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), FoodCup = c(0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 1L)), class = "data.frame", row.names = c("14",
"15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25",
"26", "27", "28", "29", "30", "31", "32", "33", "34"))
I'm following the vcd docs where assocstats is called on an xtabs call on multiple subsets of a data frame. However, I get NaNs with a specific subset because the expected observations for many cases are 0:
factor.2
factor.1 0 1 2 3 4 5 or more
0 0 12 7 1 0 1
1 0 2 1 1 0 0
2 0 8 2 1 0 0
3 0 5 4 0 0 0
4 0 1 2 2 0 0
5 0 6 8 0 0 0
6 0 5 3 0 0 0
7 0 5 1 0 0 0
8 0 5 4 0 1 0
9 0 1 1 0 1 0
10 0 5 6 0 0 1
temp.table <- structure(c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 12L,
2L, 8L, 5L, 1L, 6L, 5L, 5L, 5L, 1L, 5L, 7L, 1L, 2L, 4L, 2L, 8L,
3L, 1L, 4L, 1L, 6L, 1L, 1L, 1L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L), .Dim = c(11L, 6L), .Dimnames = structure(list(
factor.1 = c("0", "1", "2", "3", "4", "5", "6", "7", "8",
"9", "10"), factor.2 = c("0", "1", "2", "3", "4", "5 or more"
)), .Names = c("factor.1", "factor.2")), class = c("xtabs",
"table"), call = xtabs(data = cases.limited, na.action = na.omit))
library(vcd)
assocstats(temp.table)
X^2 df P(> X^2)
Likelihood Ratio 35.004 50 0.94676
Pearson NaN 50 NaN
Phi-Coefficient : NA
Contingency Coeff.: NaN
Cramer's V : NaN
Is there a way to quickly and efficiently avoid including these cases in the analysis without extensive rewriting of some of what assocstats or xtabs do? I understand that there is arguably less statistical power, but Cramer's V is already an optimistic estimator, so the results will still be useful to me.
This is a question about the fourthcorner algorithm in R. It's designed to measure the relationship between three different tables: an n x m table (table R) of m environmental variables (columns) at n sites (rows), an n x p table (table L) of p abundances (columns) at n sites (rows), and a p x s table (table Q) of s traits (columns) for p species (rows).
The fourthcorner function is in the package ade4.
All three of my dataframes are binary (0s and 1s denoting the presence or absence of a variable, a species at a site, or a trait, respectively). I've tried using "yes" and "no" instead of 0s and 1s without success.
Here are some example matrices in the format I'm using:
tabQ
Trait1 Trait2 Trait3 Trait4
Sp1 0 1 0 0
Sp2 0 1 0 0
Sp3 1 0 1 0
Sp4 1 0 1 0
Sp5 0 1 0 0
Sp6 0 1 0 0
Sp7 0 0 0 1
Sp8 0 0 0 1
tabR
EnV1 EnV2 EnV3 EnV4
Site1 1 1 1 1
Site2 1 1 0 1
Site3 0 1 0 1
Site4 1 1 1 1
Site5 1 1 0 1
Site6 0 1 0 0
Site7 0 1 0 1
Site8 0 1 0 1
Site9 1 1 1 1
Site10 1 1 0 1
Site11 1 1 1 1
Site12 0 1 0 0
Site13 1 1 0 1
Site14 1 1 0 1
Site15 0 1 0 1
Site16 1 1 0 1
Site17 0 1 0 1
Site18 1 1 1 1
Site19 1 1 0 1
Site20 1 1 0 1
tabL
Sp1 Sp2 Sp3 Sp4 Sp5 Sp6 Sp7 Sp8
Site1 1 1 0 0 0 0 0 0
Site2 1 1 0 0 0 0 0 0
Site3 1 1 0 0 0 0 0 0
Site4 1 0 0 0 0 0 0 1
Site5 1 1 0 0 0 0 0 0
Site6 1 0 0 0 1 0 0 0
Site7 1 0 0 0 0 0 0 0
Site8 0 0 0 0 1 0 0 0
Site9 1 0 0 0 0 0 0 0
Site10 1 1 0 0 0 0 0 0
Site11 0 0 1 1 0 0 0 0
Site12 0 0 0 0 0 1 0 0
Site13 1 0 0 0 0 0 0 0
Site14 0 0 0 0 1 0 0 0
Site15 1 1 0 0 0 0 0 0
Site16 1 1 0 0 0 0 0 0
Site17 1 0 0 0 0 0 0 0
Site18 0 0 1 0 0 0 0 0
Site19 1 0 0 0 0 0 0 0
Site20 1 1 0 0 0 0 1 0
I read these dataframes into R from text files, and I specify that the first column is row names.
This is the error I get when I try to use the fourthcorner function on my matrices:
fourth1=fourthcorner(tabR,tabL,tabQ,nrepet=1)
Error in apply(sim, 2, function(x) length(na.omit(x))) :
dim(X) must have a positive length
I don't understand where the problem lies, is it a formatting issue? If so, should I reformat one of the matrices? Which one is causing the trouble? Or can I not use binary traits and environmental variables for this function? In other words, can I solve this problem by changing a piece of code, or is it impossible to use this function for this question?
As an additional tidbit of information, I did email the author of the function, but unfortunately I did not understand his response fully, possibly because my R skills still leave much to be desired. Here is his response if it is helpful:
Q could contain quantitative or qualitative traits. In R, qualitative traits should be coded as factors to obtain adapted statistics (i.e. chi2 or eta2). If you code qualitative variables as dummy variables, they would be considered as quantitative.
Thank you very much for any and all insight.
I noted that your example fails only when nrepet is equal to one, so if you can use any other positive number you should be fine.
However, if you do need nrepet=1, you should contact the author of ade4 and ask him/her to fix the fourthcorner function code. I traced back the error and found that fourthcorner calls as.krandtest with sim = res$tabD[-1,] where res$tabD is a matrix with nrepet+1 rows. When nrepet=1 and you remove one row from a two-row matrix, R automatically converts the resulting one-row matrix into a vector, but the as.krandtest function expects sim to be a matrix and thus raises the error. (Subsetting with drop = FALSE, i.e. res$tabD[-1, , drop = FALSE], would keep the matrix shape.)
Here is your input data just in case somebody else would like to answer your question:
tabR
structure(list(EnV1 = c(1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 1L,
1L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L), EnV2 = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), EnV3 = c(1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), EnV4 = c(1L, 1L, 1L, 1L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("EnV1",
"EnV2", "EnV3", "EnV4"), row.names = c("Site1", "Site2", "Site3",
"Site4", "Site5", "Site6", "Site7", "Site8", "Site9", "Site10",
"Site11", "Site12", "Site13", "Site14", "Site15", "Site16", "Site17",
"Site18", "Site19", "Site20"), class = "data.frame")
tabL
structure(list(Sp1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L,
0L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L), Sp2 = c(1L, 1L, 1L,
0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L,
1L), Sp3 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), Sp4 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
Sp5 = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L), Sp6 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L
), Sp7 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), Sp8 = c(0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L)), .Names = c("Sp1", "Sp2", "Sp3", "Sp4", "Sp5", "Sp6",
"Sp7", "Sp8"), row.names = c("Site1", "Site2", "Site3", "Site4",
"Site5", "Site6", "Site7", "Site8", "Site9", "Site10", "Site11",
"Site12", "Site13", "Site14", "Site15", "Site16", "Site17", "Site18",
"Site19", "Site20"), class = "data.frame")
tabQ
structure(list(Trait1 = c(0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L), Trait2 = c(1L,
1L, 0L, 0L, 1L, 1L, 0L, 0L), Trait3 = c(0L, 0L, 1L, 1L, 0L, 0L,
0L, 0L), Trait4 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L)), .Names = c("Trait1",
"Trait2", "Trait3", "Trait4"), row.names = c("Sp1", "Sp2", "Sp3",
"Sp4", "Sp5", "Sp6", "Sp7", "Sp8"), class = "data.frame")