Turn combinations of columns into some kind of interpretable variable in R

I want to turn combinations of columns into some kind of interpretable variable. There are 3 levels of a factor repeated in three columns, for each id. I would like to get a list of all the combinations between the variables, and once I have the list, I want to know how many times each combination occurs. For example, when q1 and q2 are the same, it should return "A", and then A appears XX times. Anyone with suggestions? Thanks!!
id <- 1:10
set.seed(1)
q1 <- sample(1:3, 10, replace=TRUE)
set.seed(2)
q2 <- sample(1:3, 10, replace=TRUE)
set.seed(2)
q3 <- sample(1:3, 10, replace=TRUE)
df <- data.frame(id,q1,q2,q3)
df
id q1 q2 q3
1 1 1 1 1
2 2 2 3 3
3 3 2 2 2
4 4 3 1 1
5 5 1 3 3
6 6 3 3 3
7 7 3 1 1
8 8 2 3 3
9 9 2 2 2
10 10 1 2 2
if df$q1=="1" & df$q2=="1" print A
if df$q1=="1" & df$q2=="2" print B
if df$q1=="1" & df$q2=="3" print C
if df$q1=="2" & df$q2=="3" print D
if df$q1=="2" & df$q2=="2" print E
if df$q1=="3" & df$q2=="3" print F
if df$q2=="1" & df$q3=="1" print G
if df$q2=="1" & df$q3=="2" print H
response <- save(print A, print B, print C and so on....)
length(A)
length(B)
and so on...

I think this should do what you want, using base R. I hope I understood your desired output. I basically combined each pair of columns into its own variable (comb.var[, i]), combined that with each column-name pair to create another variable, output$fct, then relabeled the new variable (which represents each q-pair by value-pair combination) and counted the occurrences of each combination with summary().
Code:
# dimensions of df
n = nrow(df) # rows
p = ncol(df) # columns
# unique pairs of q columns
pairs.n = choose(p - 1, 2) # number of unique pairs
pairs = combn(1:(p - 1), 2) # matrix of those pairs
# matrix of NAs of the proper size
comb.var <- matrix(NA, nrow = n, ncol = pairs.n)
for (combo in 1:ncol(pairs)) {
  i = pairs[1, combo]
  j = pairs[2, combo]
  # get the right 2 columns from df
  qi = df[, i + 1]
  qj = df[, j + 1]
  # combine them into 1 variable
  comb.var[, combo] <- paste(qi, qj, sep = "_")
}
# clean up the output: turn comb.var into a vector and add id columns
output = data.frame(id = rep(df$id, times = pairs.n),
                    qi = rep(pairs[1, ], each = n),
                    qj = rep(pairs[2, ], each = n),
                    val = as.vector(comb.var))
# combine variables again
output$fct = with(output, paste(qi, qj, val, sep = "."))
# count number of different outputs
uniq.n = length(unique(output$fct))
# re-label the factor
output$fct <- factor(output$fct, labels = LETTERS[1:uniq.n])
# count the group members
summary(output$fct)
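As a follow-up sketch (my own addition, not part of the original answer): because factor() assigns its labels to the sorted unique levels, you can rebuild a key showing which letter stands for which q-pair/value combination from the columns still present in output:
# sketch: recompute the raw combination strings and pair them with the letters;
# factor() labels the sorted unique levels in order, so sorting matches them up
combo <- with(output, paste(qi, qj, val, sep = "."))
key <- data.frame(letter = LETTERS[1:uniq.n], combo = sort(unique(combo)))
key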

Related

Select values when column names are stored as concatenated strings

It's hard to explain, so I'll start with an example. I have some numeric columns (A, B, C). The column 'tmp' contains variable names of the numeric columns as concatenated strings:
set.seed(100)
A <- floor(runif(5, min=0, max=10))
B <- floor(runif(5, min=0, max=10))
C <- floor(runif(5, min=0, max=10))
tmp <- c('A','B,C','C','A,B','A,B,C')
df <- data.frame(A,B,C,tmp)
A B C tmp
1 3 4 6 A
2 2 8 8 B,C
3 5 3 2 C
4 0 5 3 A,B
5 4 1 7 A,B,C
Now, for each row, I want to use the variable names in tmp to select the values from the corresponding numeric columns with the same name(s). Then I want to keep only the rows where all the selected values are less than or equal to 3.
E.g. in the first row, tmp is A, and the corresponding value in column A is 3, i.e. keep this row.
Another example: in row 4, tmp is A,B. The corresponding values are A = 0 and B = 5. Thus, not all the selected values are less than or equal to 3, and this row is discarded.
Desired result:
A B C tmp
1 3 4 6 A
2 5 3 2 C
How can I perform such filtering?
This is a bit more complicated than I like and there might be a more elegant solution, but here we go:
#split tmp
col <- strsplit(df[["tmp"]], ",")
#create an index matrix
inds <- do.call(rbind, Map(data.frame, row = seq_along(col), col = col))
inds$col <- match(inds$col, names(df))
inds <- as.matrix(inds)
#check
chk <- m <- as.matrix(df[, names(df) != "tmp"])
mode(chk) <- "logical"
chk[] <- NA
chk[inds] <- m[inds] <= 3
sel <- apply(chk, 1, prod, na.rm = TRUE)
df[as.logical(sel),]
# A B C tmp
#1 3 4 6 A
#3 5 3 2 C
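The key trick above is matrix indexing: m[cbind(rows, cols)] extracts one element per (row, column) pair. A minimal toy illustration (my own, not from the answer):
m <- matrix(1:9, nrow = 3)
m[cbind(c(1, 3), c(2, 2))] # picks m[1, 2] and m[3, 2], i.e. 4 6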
Not sure if it always works (and it probably isn't the best solution)... but it worked here:
library(dplyr)
library(tidyr)
library(stringr)
List <- vector("list")
for (i in 1:nrow(df)) {
  # columns named in this row's tmp
  tmpT <- as.vector(str_split(df$tmp[i], ",", simplify = TRUE))
  selec <- df %>%
    select(tmpT) %>%
    slice(i) %>%
    filter_all(all_vars(. <= 3)) %>%
    unite(val, sep = ", ")
  # keep the row only if all selected values passed the filter
  if (nrow(selec) == 0) {
    tab <- NA
  } else {
    tab <- df[i, ]
  }
  List[[i]] <- tab
}
df2 <- do.call("rbind", List)
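A side note on newer dplyr versions (an assumption about your setup, not part of the original answer): passing a character vector straight to select() triggers an ambiguity warning in recent releases, and wrapping it as select(all_of(tmpT)) makes the external-vector intent explicit.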
This answer has some similarities with #Roland's, but here we work with the data in a 'longer' format:
# create row index
df$ri = seq_len(nrow(df))
# split the concatenated column
l <- strsplit(df$tmp, ',')
# repeat each row of the data with the lengths of the split string,
# bind with individual strings
d = cbind(df[rep(1:nrow(df), lengths(l)), ], x = unlist(l))
# use match to grab values from corresponding columns
d$val <- d[cbind(seq(nrow(d)), match(d$x, names(d)))]
# for each original row 'ri', check if all values are <= 3. use result to index data frame
d[as.logical(ave(d$val, d$ri, FUN = function(x) all(x <= 3))), ]
# A B C tmp ri x val
# 1 3 4 6 A 1 A 3
# 3 5 3 2 C 3 C 2

Take sum of rows for every 3 columns in a dataframe

I have searched high and low and have also tried multiple options, but did not get the desired output as described below:
I have a dataframe df3 whose column headers are dates, with values of 0 or 1, as shown below:
df = data.frame(replicate(6,sample(0:1,6,rep=TRUE)))
colnames(df) = c("1/1/2018","1/2/2018","1/3/2018","1/4/2018","1/5/2018","1/6/2018")
df2 = data.frame(c("A","B","C","D","E","F"))
colnames(df2) = c("CUST_ID")
df3 = cbind(df2,df)
Now I need df4, in which the sums of the first 3 columns form one column; this is then repeated dynamically in groups of 3 for the rest of the columns.
df4
Options I tried:
a) rbind.data.frame(apply(matrix(df3, nrow = n - 1), 1,sum))
b) col_list <- list(c("1/1/2018","1/2/2018","1/3/2018"), c("1/4/2018","1/5/2018","1/6/2018"))
lapply(col_list, function(x)sum(df3[,x])) %>% data.frame
One way would be to split df3 every 3 columns using split.default. To split the data we generate a sequence using rep; then for each data frame we take rowSums, and finally we cbind the results together.
cbind(df3[1], sapply(split.default(df3[-1],
        rep(1:ncol(df3), each = 3, length.out = ncol(df3) - 1)), rowSums))
# CUST_ID 1 2
#1 A 1 1
#2 B 2 0
#3 C 2 1
#4 D 1 1
#5 E 2 2
#6 F 2 2
FYI, the sequence generated from rep is
rep(1:ncol(df3), each = 3, length.out = (ncol(df3) -1))
#[1] 1 1 1 2 2 2
This makes it possible to split every 3 columns.
The results are different because OP used sample without set.seed.
If rep seems too long then we can generate the same sequence of columns using gl
gl(ncol(df3[-1])/3, 3)
#[1] 1 1 1 2 2 2
#Levels: 1 2
So the final code would be
cbind(df3[1], sapply(split.default(df3[-1], gl(ncol(df3[-1])/3, 3)), rowSums))
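One caveat (my own observation): gl(ncol(df3[-1])/3, 3) assumes the number of value columns is an exact multiple of 3, while the rep() version with length.out would also put a shorter final group in its own column.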
We can use seq to create an index, get the subset of columns in a list, Reduce by taking the sum, and create the new columns
df4 <- df3[1]
df4[paste0('col', c('123', '456'))] <- lapply(seq(2, ncol(df3), by = 3),
  function(i) Reduce(`+`, df3[i:min(i + 2, ncol(df3))]))
df4
# CUST_ID col123 col456
#1 A 2 2
#2 B 3 3
#3 C 1 3
#4 D 2 3
#5 E 2 1
#6 F 0 1
data
set.seed(123)
df <- data.frame(replicate(6,sample(0:1,6,rep=TRUE)))
colnames(df) <- c("1/1/2018","1/2/2018","1/3/2018","1/4/2018","1/5/2018","1/6/2018")
df2 <- data.frame(c("A","B","C","D","E","F"))
colnames(df2) = c("CUST_ID")
df3 <- cbind(df2, df)

Sum Values of Every Column in Data Frame with Conditional For Loop

So I want to go through a data set and sum the values from each column based on the condition of my first column. The data and my code so far looks like this:
x v1 v2 v3
1 0 1 5
2 4 2 10
3 5 3 15
4 1 4 20
for (i in colnames(data)) {
  if (data$x > 2) {
    x1 <- sum(data[[i]])
  } else {
    x2 <- sum(data[[i]])
  }
}
My assumption was that the for loop would call each column by name from the data and then sum the values in each column based on whether they matched the condition of column x.
I want to sum half the values from each column and assign them to a value x1 and do the same for the remainder, assigning it to x2. I keep getting an error saying the following:
the condition has length > 1 and only the first element will be used
What am I doing wrong and is there a better way to go about this? Ideally I want a table that looks like this:
v1 v2 v3
x1 6 7 35
x2 4 3 15
Here's a dplyr solution. First, I define the data frame.
df <- read.table(text = "x v1 v2 v3
1 0 1 5
2 4 2 10
3 5 3 15
4 1 4 20", header = TRUE)
# x v1 v2 v3
# 1 1 0 1 5
# 2 2 4 2 10
# 3 3 5 3 15
# 4 4 1 4 20
Then, I create a label (x_check) to indicate which group each row belongs to based on your criterion (x > 2), group by this label, and summarise each column with a v in its name using sum.
# Load library
library(dplyr)
df %>%
  mutate(x_check = ifelse(x > 2, "x1", "x2")) %>%
  group_by(x_check) %>%
  summarise_at(vars(contains("v")), funs(sum))
# # A tibble: 2 x 4
# x_check v1 v2 v3
# <chr> <int> <int> <int>
# 1 x1 6 7 35
# 2 x2 4 3 15
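As an aside (assuming dplyr >= 1.0, not part of the original answer): funs() has since been deprecated, and the same summary can be written with across():
df %>%
  mutate(x_check = ifelse(x > 2, "x1", "x2")) %>%
  group_by(x_check) %>%
  summarise(across(contains("v"), sum))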
Not sure if I understood your intention correctly, but here is how you would reproduce your results with base R:
df <- data.frame(
  x = c(1:4),
  v1 = c(0, 4, 5, 1),
  v2 = 1:4,
  v3 = (1:4) * 5
)
x1 <- colSums(df[df$x > 2, 2:4, drop = FALSE])
x2 <- colSums(df[df$x <= 2, 2:4, drop = FALSE])
Where
df[df$x > 2, 2:4, drop = FALSE] will create a subset of df where the rows satisfy df$x > 2 and the columns are 2:4 (meaning the second, third and fourth column), drop = FALSE is there mainly to prevent R from simplifying the results in some special cases
colSums does a by-column sum on the subsetted data.frame
If your x column was really a condition (e.g. a logical vector) you could just do
x1 <- colSums(df[df$x, 2:4, drop = FALSE])
x2 <- colSums(df[!df$x, 2:4, drop = FALSE])
Note that there is no loop needed to get these results; with R you should use vectorized functions as much as possible. (The message you saw arises because if() expects a single TRUE/FALSE value, while data$x > 2 is a whole vector of logicals.)
More generally, you could do such aggregation with aggregate:
aggregate(df[, 2:4], by = list(condition = df$x <= 2), FUN = sum)
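For the example data this returns (condition is TRUE when x <= 2):
#   condition v1 v2 v3
# 1     FALSE  6  7 35
# 2      TRUE  4  3 15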

Speed up the lookup procedure

I have two tables: coc_data and DT. The coc_data table contains co-occurrence frequencies between pairs of words. Its structure is similar to:
word1 word2 freq
1 A B 1
2 A C 2
3 A D 3
4 A E 2
The second table, DT, contains frequencies for each word in different years, e.g.:
word year weight
1 A 1966 9
2 A 1967 3
3 A 1968 1
4 A 1969 4
5 A 1970 10
6 B 1966 9
In reality, coc_data currently has 150,000 rows and DT has about 450,000 rows. Below is R code which simulates both datasets.
# Prerequisites
library(data.table)
set.seed(123)
n <- 5
# Simulate co-occurrence data [coc_data]
words <- LETTERS[1:n]
# Times each word used
freq <- sample(10, n, replace = TRUE)
# Co-occurrence data.frame
coc_data <- setNames(data.frame(t(combn(words,2))),c("word1", "word2"))
coc_data$freq <- apply(combn(freq, 2), 2, function(x) sample(1:min(x), 1))
# Simulate frequency table [DT]
years <- (1965 + 1):(1965 + 5)
word <- sort(rep(LETTERS[1:n], 5))
year <- rep(years, 5)
weight <- sample(10, 25, replace = TRUE)
freq_data <- data.frame(word = word, year = year, weight = weight)
# Combine to data.table for speed
DT <- data.table(freq_data, key = c("word", "year"))
My task is to normalize frequencies in coc_data table according to frequencies in DT table using the following function:
my_fun <- function(x, freq_data, years) {
  word1 <- x[1]
  word2 <- x[2]
  freq12 <- as.numeric(x[3])
  freq1 <- sum(DT[word == word1 & year %in% years]$weight)
  freq2 <- sum(DT[word == word2 & year %in% years]$weight)
  ei <- (freq12^2) / (freq1 * freq2)
  return(ei)
}
Then I use the apply() function to apply my_fun to each row of the coc_data table:
apply(X = coc_data, MARGIN = 1, FUN = my_fun, freq_data = DT, years = years)
Because the DT lookup table is quite large, the whole mapping process takes very long. I wonder how I could improve my code to speed up the computation.
Since the years parameter is constant in my_fun for the actual usage with apply, you could compute the frequencies for all words first:
f <- aggregate(weight ~ word, data = DT, FUN = sum)
Now transform this into a hash, e.g.:
hs <- f$weight
names(hs) <- f$word
Now in my_fun use the precomputed frequencies by looking up hs[word]. This should be faster.
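A minimal sketch of the modified function (my own illustration, assuming hs is built as above; the years filter is dropped because years covers all simulated years anyway):
my_fun_hash <- function(x) {
  freq12 <- as.numeric(x[3])
  # named-vector lookup replaces the data.table subset
  (freq12^2) / (hs[x[1]] * hs[x[2]])
}
apply(X = coc_data, MARGIN = 1, FUN = my_fun_hash)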
Even better - the answer you're looking for is
(coc_data$freq)^2 / (hs[coc_data$word1] * hs[coc_data$word2])
The data.table implementation of this would be:
f <- DT[, sum(weight), word]
vec <- setNames(f$V1, f$word)
setDT(coc_data)[, freq_new := freq^2 / (vec[word1] * vec[word2])]
which gives the following result:
> coc_data
word1 word2 freq freq_new
1: A B 1 0.0014792899
2: A C 1 0.0016025641
3: A D 1 0.0010683761
4: A E 1 0.0013262599
5: B C 5 0.0434027778
6: B D 1 0.0011574074
7: B E 1 0.0014367816
8: C D 4 0.0123456790
9: C E 1 0.0009578544
10: D E 2 0.0047562426
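An alternative data.table formulation (a sketch on the same objects, not from the original answer) replaces the named-vector lookup with update joins:
# per-word totals, then join them onto word1 and word2
totals <- DT[, .(tot = sum(weight)), by = word]
cd <- as.data.table(coc_data)
cd[totals, tot1 := i.tot, on = .(word1 = word)]
cd[totals, tot2 := i.tot, on = .(word2 = word)]
cd[, freq_new := freq^2 / (tot1 * tot2)]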

R: split a data-frame, apply a function to all row-pairs in each subset

I am new to R and am trying to accomplish the following task efficiently.
I have a data.frame, x, with columns: start, end, val1, val2, val3, val4. The rows are sorted/ordered by start.
For each start, I first have to find all the entries in x that share that same start. Because the list is ordered, they will be consecutive. If a particular start occurs only once, I ignore it. Then, for the entries that have the same start, say for one particular start there are 3 entries, as shown below:
entries for start=10
start end val1 val2 val3 val4
10 25 8 9 0 0
10 55 15 200 4 9
10 30 4 8 0 1
Then, I have to take 2 rows at a time and perform a fisher.test on the 2x4 matrices of val1:4. That is,
row1:row2 => fisher.test(matrix(c(8,15,9,200,0,4,0,9), nrow=2))
row1:row3 => fisher.test(matrix(c(8,4,9,8,0,0,0,1), nrow=2))
row2:row3 => fisher.test(matrix(c(15,4,200,8,4,0,9,1), nrow=2))
The code I wrote accomplishes this using traditional for-loops. I was wondering if it could be vectorized or improved in any way.
f_start = as.factor(x$start) # convert start to factor to get counts
tab_f_start = table(f_start) # tabulate to access the counts
o_start1 = NULL
o_end1 = NULL
o_start2 = NULL
o_end2 = NULL
p_val = NULL
for (i in 1:length(tab_f_start)) {
  # check if there is more than 1 entry with the same start
  if (tab_f_start[i] > 1) {
    # get all rows for the current start
    cur_entry = x[x$start == as.integer(names(tab_f_start[i])), ]
    # loop over all row combinations to obtain p-values
    ctr = tab_f_start[i]
    for (j in 1:(ctr - 1)) {
      for (k in (j + 1):ctr) {
        # store start and end values separately
        o_start1 = c(o_start1, cur_entry$start[j])
        o_end1 = c(o_end1, cur_entry$end[j])
        o_start2 = c(o_start2, cur_entry$start[k])
        o_end2 = c(o_end2, cur_entry$end[k])
        # construct the 2x4 matrix for this row pair
        m1 = c(cur_entry$val1[j], cur_entry$val1[k])
        m2 = c(cur_entry$val2[j], cur_entry$val2[k])
        m3 = c(cur_entry$val3[j], cur_entry$val3[k])
        m4 = c(cur_entry$val4[j], cur_entry$val4[k])
        m = matrix(c(m1, m2, m3, m4), nrow = 2)
        p_val = c(p_val, fisher.test(m)$p.value)
      }
    }
  }
}
result = data.frame(o_start1, o_end1, o_start2, o_end2, p_val)
Thank you!
As #Ben Bolker suggested, you can use the plyr package to do this compactly. The first step is to create a wider data frame that contains the desired row-pairs. The row-pairs are generated using the combn function:
set.seed(1)
x <- data.frame(start = c(1, 2, 2, 2, 3, 3, 3, 3),
                end = 1:8,
                v1 = sample(8), v2 = sample(8), v3 = sample(8), v4 = sample(8))
require(plyr)
z <- ddply(x, .(start), function(d) if (nrow(d) == 1) NULL
           else {
             row_pairs <- combn(nrow(d), 2)
             cbind(a = d[row_pairs[1, ], ],
                   b = d[row_pairs[2, ], ])
           })[, -1]
The second step is to extract the p.value from applying the fisher.test to each row-pair:
result <- ddply(z, .(a.start, a.end, b.start, b.end),
                function(d)
                  fisher.test(matrix(unlist(d[, -c(1, 2, 7, 8)]),
                                     nrow = 2, byrow = TRUE))$p.value)
> result
a.start a.end b.start b.end V1
1 2 2 2 3 0.33320784
2 2 2 2 4 0.03346192
3 2 3 2 4 0.84192284
4 3 5 3 6 0.05175017
5 3 5 3 7 0.65218289
6 3 5 3 8 0.75374989
7 3 6 3 7 0.34747011
8 3 6 3 8 0.10233072
9 3 7 3 8 0.52343422
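For reference, a base-R sketch of the same computation (my own, using the x simulated above): split on start, drop singleton groups, and run fisher.test over each pair of rows within a group.
res <- do.call(rbind, lapply(split(x, x$start), function(d) {
  if (nrow(d) < 2) return(NULL)      # ignore starts that occur only once
  row_pairs <- combn(nrow(d), 2)     # all row pairs within this group
  data.frame(a.start = d$start[row_pairs[1, ]], a.end = d$end[row_pairs[1, ]],
             b.start = d$start[row_pairs[2, ]], b.end = d$end[row_pairs[2, ]],
             p.value = apply(row_pairs, 2, function(p)
               fisher.test(matrix(unlist(d[p, 3:6]), nrow = 2))$p.value))
}))
res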
