extract information from a data frame

extract information from a data frame - r

I have a data frame like below
df<- structure(list(s1 = structure(1:3, .Label = c("3-4", "4-1", "5-4"
), class = "factor"), s2 = structure(1:3, .Label = c("2-4", "3-15",
"7-16"), class = "factor")), .Names = c("s1", "s2"), row.names = c(NA,
-3L), class = "data.frame")
Looks like below
> df
# s1 s2
#1 3-4 2-4
#2 4-1 3-15
#3 5-4 7-16
what I want to do is to first search and find those values that are similar after -
for example here 4 is in first row of s1, first row of s2 and third row of s1
-The second column indicates how many times those values were found
-The third column shows how many of them are from first column of df
-The fourth column shows how many of them are from second column of df
-The fifth is which strings are from the first columns
-The sixth is which strings are from teh second columns
the output looks like this
Value repeated s1N s1N ss1 ss2
4 3 2 1 3,5 2
1 1 1 - 4 -
15 1 - 1 - 3
16 1 - 1 - 7

Surprisingly tough problem. It's good to break it down into several logical steps:
## 1: split into (val,ss) pairs, and capture ci (column index) association
res <- setNames(do.call(rbind,lapply(seq_along(df),function(ci)
do.call(rbind,lapply(strsplit(as.character(df[[ci]]),'-'),function(x)
data.frame(x[2L],x[1L],ci,stringsAsFactors=F)
))
)),c('val','ss','ci'));
res;
## val ss ci
## 1 4 3 1
## 2 1 4 1
## 3 4 5 1
## 4 4 2 2
## 5 15 3 2
## 6 16 7 2
## 2: aggregate ss (joining on comma) by (val,ci), and capture record count as n
res <- do.call(rbind,by(res,res[c('val','ci')],function(x)
data.frame(val=x$val[1L],ci=x$ci[1L],n=nrow(x),ss=paste(x$ss,collapse=','),stringsAsFactors=F)
));
res;
## val ci n ss
## 1 1 1 1 4
## 2 4 1 2 3,5
## 3 15 2 1 3
## 4 16 2 1 7
## 5 4 2 1 2
## 3: reshape to wide format
res <- reshape(res,idvar='val',timevar='ci',dir='w');
res;
## val n.1 ss.1 n.2 ss.2
## 1 1 1 4 NA <NA>
## 2 4 2 3,5 1 2
## 3 15 NA <NA> 1 3
## 4 16 NA <NA> 1 7
## 4: add repeated column; can be calculated by summing all n.* columns
## note: leveraging psum() from <http://stackoverflow.com/questions/12139431/add-variables-whilst-ignoring-nas-using-transform-function>
psum <- function(...,na.rm=F) { x <- list(...); rowSums(matrix(unlist(x),ncol=length(x)),na.rm=na.rm); };
res$repeated <- do.call(psum,c(res[grep('^n\\.[0-9]+$',names(res))],na.rm=T));
res;
## val n.1 ss.1 n.2 ss.2 repeated
## 1 1 1 4 NA <NA> 1
## 2 4 2 3,5 1 2 3
## 3 15 NA <NA> 1 3 1
## 4 16 NA <NA> 1 7 1
With regard to the NAs, you can fix them up afterward if you want. However, I would advise that the proper type of the n.* columns is integer, since they represent counts, therefore the use of '-' (as in your sample output) to represent null cells is inappropriate. I would suggest zero instead. The dash is fine for the ss.* columns, since they are strings. Here's how you can do this:
n.cis <- grep('^n\\.[0-9]+$',names(res));
ss.cis <- grep('^ss\\.[0-9]+$',names(res));
res[n.cis][is.na(res[n.cis])] <- 0L;
res[ss.cis][is.na(res[ss.cis])] <- '-';
res;
## val n.1 ss.1 n.2 ss.2 repeated
## 1 1 1 4 0 - 1
## 2 4 2 3,5 1 2 3
## 3 15 0 - 1 3 1
## 4 16 0 - 1 7 1

First thing you will need to do is extract the numbers from your strings. Running:
newdfstring <- apply(df,1:2, function(v) unlist(strsplit(v,"-")))
newdf <- apply(newdfstring,1:3, as.numeric)
splits the strings in the first line, and converts them to numeric values in the second. The result is a 3-dimensional matrix which you can use to extract your values.
First create a new dataframe:
#length of the columns in the new frame = number of unique values
dflength <- length(unique(array(newdf[2,,])))
dfout <- data.frame(Value=rep(0,dflength),repeated=rep(0,dflength),s1N=rep(0,dflength),s2N=rep(0,dflength),ss1=rep(0,dflength),ss2=rep(0,dflength))
The most obvious way (yet maybe not the most efficient) would then be to loop and match whatever it is you need:
dfout$Value <- unique(array(newdf[2,,]))
for(i in 1:dflength){
getID <- which(as.data.frame(table(newdf[2,,]))$Var1==dfout$Value[i])
dfout$repeated[i] <- as.data.frame(table(newdf[2,,]))$Freq[getID]
dfout$s1N[i] <- as.data.frame(table(newdf[2,,1]))$Freq[getID]
if(is.na(dfout$s1N[i])){
dfout$s1N[i] <- 0
}
dfout$s2N[i] <- as.data.frame(table(newdf[2,,2]))$Freq[getID]
if(is.na(dfout$s2N[i])){
dfout$s2N[i] <- 0
}
getID <- which(newdf[2,,1]==dfout$Value[i])
if(length(getID)>0){
dfout$ss1[i] <- toString(newdf[1,,1][getID])
} else {
dfout$ss1[i] <- 0
}
getID <- which(newdf[2,,2]==dfout$Value[i])
if(length(getID)>0){
dfout$ss2[i] <- toString(newdf[1,,2][getID])
} else {
dfout$ss2[i] <- 0
}
}
dfout
# Value repeated s1N s2N ss1 ss2
#1 4 3 2 1 3, 5 2
#2 1 1 1 1 4 0
#3 15 1 0 1 0 3
#4 16 1 0 0 0 7
EDIT to loop n amount of s values
newdfstring <- apply(df,1:2, function(v) unlist(strsplit(v,"-")))
newdf <- apply(newdfstring,1:3, as.numeric)
dflength <- length(unique(array(newdf[2,,])))
#find the number of s variables
slength <- length(newdf[1,1,])
#create a matrix of appropriate size
dfout <- matrix(data=NA,nrow=dflength,ncol=(2+2*slength))
#create a (near)-empty names array, we will fill it in later
names <- c("Value","repeated",rep("",2*slength))
#fill in the Values column
dfout[,1] <- unique(array(newdf[2,,]))
#loop for every s variable
for(j in 1:slength){
#get their names, paste N or s and add them to the names array
names[2+j] <- paste(names(df)[j],"N",sep="")
names[2+j+slength] <- paste("s",names(df)[j],sep="")
#loop to get the other values
for(i in 1:dflength){
getID <- which(as.data.frame(table(newdf[2,,]))$Var1==dfout[i,1])
dfout[i,2] <- as.data.frame(table(newdf[2,,]))$Freq[getID]
dfout[i,2+j] <- as.data.frame(table(newdf[2,,j]))$Freq[getID]
if(is.na(dfout[i,2+j])){
dfout[i,2+j] <- 0
}
getID <- which(newdf[2,,j]==dfout[i,1])
if(length(getID)>0){
dfout[i,2+j+slength] <- toString(newdf[1,,j][getID])
} else {
dfout[i,2+j+slength] <- 0
}
}
}
colnames(dfout)<-names
as.data.frame(dfout)
# Value repeated s1N s2N ss1 ss2
#1 4 3 2 1 3, 5 2
#2 1 1 1 1 4 0
#3 15 1 0 1 0 3
#4 16 1 0 0 0 7

df <-
structure(
list(
s1 = structure(1:3, .Label = c("3-4", "4-1", "5-4"), class = "factor"),
s2 = structure(1:3, .Label = c("2-4", "3-15", "7-16"), class = "factor"
)
), .Names = c("s1", "s2"), row.names = c(NA,-3L), class = "data.frame"
)
library(tidyr)
library(dplyr)
# Split columns at "-" and add to data.frame
splitCols <- function(df) {
new_headers <- paste("s1", c("1st", "2nd"), sep = "_")
split_1 <- (separate(df, s1, into = new_headers, sep = "-"))[,new_headers]
split_1$s1_1st <- as.integer(split_1$s1_1st)
split_1$s1_2nd <- as.integer(split_1$s1_2nd)
new_headers <- paste("s2", c("1st", "2nd"), sep = "_")
split_2 <- (separate(df, s2, into = new_headers, sep = "-"))[,new_headers]
split_2$s2_1st <- as.integer(split_2$s2_1st)
split_2$s2_2nd <- as.integer(split_2$s2_2nd)
cbind(df, split_1, split_2)
}
# given a df outputted from splitCols return final df
analyzeDF <- function(df) {
target_vals <- unique(c(df$s1_2nd, df$s2_2nd)) # for each uniq val compute stuff
out_df <- data.frame(Value = integer(0),
repeated = integer(0),
s1N = integer(0),
s2N = integer(0),
ss1 = character(0),
ss2 = character(0))
# iterate through target_vals, create a row of output,
# and append to out_df
for (val in target_vals) {
s1_match <- val == df$s1_2nd
s2_match <- val == df$s2_2nd
total_cnt <- sum(s1_match, s2_match)
s1_firstcol <- paste(df$s1_1st[s1_match], collapse = ",")
s2_firstcol <- paste(df$s2_1st[s2_match], collapse = ",")
# coerce empty string to "-"
if (s1_firstcol == "") s1_firstcol <- "-"
if (s2_firstcol == "") s2_firstcol <- "-"
row_df <- data.frame(Value = val,
repeated = total_cnt,
s1N = sum(s1_match),
s2N = sum(s2_match),
ss1 = s1_firstcol,
ss2 = s2_firstcol)
out_df <- rbind(out_df, row_df)
}
return(out_df)
}
(df_split <- splitCols(df))
analyzeDF(df_split)
## Value repeated s1N s2N ss1 ss2
## 1 4 3 2 1 3,5 2
## 2 1 1 1 0 4 -
## 3 15 1 0 1 - 3
## 4 16 1 0 1 - 7

I have totally rewritten all the codes based on your real data, and I have tested it on my machine. Since it is a pretty big dataframe, it takes some time to run, and the loops are not avoidable in my opinion.
# function to split the strings
myfun<-function(x){
x<-strsplit(as.character(x), '-')
x1<-unlist(x)
x.new<-as.data.frame(matrix(x1, byrow = T, length(x)))
return(x.new)
}
# this returns a list of dataframes
list.v<-lapply(df[1:dim(df)[2]], myfun)
# like this
head(list.v[[17]])
# try to combine all the dfs, produced an error of mismatching # of columns
df2<-do.call(rbind, list.v)
# some of the dfs in list.v are all NA's, they should be dropped
sum<-summary(list.v)
list.v<-list.v[-which(sum[,1] != "2")] # this excludes those all-NA datafrmes in list.v
# now combine all dfs for indexing purposes
df2<-do.call(rbind, list.v)
# create "value", "repeated" column in the desired result df.
# These codes are same as my previous answer
value<-names(table(df2[,2]))
repeated<-as.vector(table(df2[,2]))
# create an empty list to store the counts columns
list.count<-vector("list", length = length(list.v))
# every df in list.v has same number of rows, get the row number
rownum<-nrow(list.v[[1]])
# use a for loop to fill out list.count
for(i in 0:(length(list.count)-1)){
row.start<-i*rownum+1 # it is kind of tricky here
row.end<-(i+1)*rownum # same as above
list.count[[i + 1]]<-as.vector(table(df2[,2][row.start:row.end]))
}
# combine the vectors in list.count and assing names
count.df<-do.call(cbind, list.count)
count.df<-as.data.frame(count.df)
# create & assign colum names in the format of "s_n", and "_" is filled with corresponding original column name
names.cnt<-character()
for(i in 1:length(names(list.v))){
names.cnt[i]<-paste("s", names(list.v)[i], "n", sep="")
}
names(count.df)<-names.cnt
# this is a very long loop to concatenate the strings and store them into a matrix, but it gets the job done here.
ss.store<-matrix(,nrow = length(value), ncol = length(list.v), byrow = FALSE)
for(i in 1:length(list.v)){
for(j in 1:length(value)){
ss.store[j,i]<-paste(list.v[[i]][,1][which(list.v[[i]][,2] == value[j])], collapse =",")
}
}
# create a df for strings
string.df<-as.data.frame(ss.store, stringsAsFactors = FALSE)
# create & assign names to the df
names.str<-character()
for(i in 1:length(names(list.v))){
names.str[i]<-paste("s", "s", names(list.v)[i], sep="")
}
names(string.df)<-names.str
# combine everything and form the new data frame
new.df<-cbind(value, repeated, count.df, string.df, stringAsFactors = FALSE)
new.df[1:10, 1:15]
value repeated sAn sF1n sF2n sF3n sF4n sF5n sF6n sF7n sF8n sF9n sF10n sF11n sF12n
1 100 155 3 0 0 0 0 0 0 0 0 0 0 0 0
2 1005 14 1 0 0 0 0 0 0 0 0 0 0 0 0
3 1006 50 1 0 0 0 0 0 0 0 0 0 0 0 0
4 1023 1 1 0 0 0 0 0 0 0 0 0 0 0 0
5 1025 38 1 0 0 0 0 0 0 0 0 0 0 0 0
6 1030 624 1 0 1 2 0 0 0 0 0 0 1 0 0
7 1035 1 1 0 0 0 0 0 0 0 0 0 0 0 0
8 104 165 2 0 0 0 0 0 0 0 0 0 0 0 0
9 1076 186 2 0 0 0 0 0 0 0 0 0 0 0 0
10 1078 333 3 0 0 0 0 0 0 0 0 0 0 0 0

Related

Separate long numbers into individual components in a data frame

I have a data frame with several hundred vectors that look like this
To analyze them I must split the columns so that the number is separated into its individual components in the rows beneath.
V1
0
0
0
0
0
0
...
I've tried using this code and tweaking it but I can't get it to work. There are 2225 columns and the vectors are not all the same size.
text_data <- read_excel("./data/wordcount_vectors.xlsx")
text_vector_data <- text_data %>% select(wordcountvec)
wordvec_list <- c()
for (i in 1:nrow(text_vector_data)){
text_vector_data[i,] <- removePunctuation(as.character(text_vector_data[i,]))
x <- as.list(text_vector_data[i,])
wordvec_list <- c(wordvec_list, x)
}
wordvec_df <- as.data.frame(wordvec_list)
df <- data.frame(matrix(ncol = 2225, nrow = 1106))
for (i in 1:ncol(text_vector_data)){ #Change range depending on size of c
c <- as.numeric(strsplit(as.character(wordvec_df[i]), "")[[1]])
dd[i] <- c[[i]]
}
word_vec_df <- Filter(function(x)!all(is.na(x)), dd)
row.names(word_vec_df)<- NULL ; colnames(word_vec_df)<- NULL
word_vec_df <- t(word_vec_df)
here's some toy data to try
v1 <- (100011000)
v2 <- (10102100)
v3 <- (1120210011)
wordcount_df <- data_frame(v1,v2,v3)

First, you need to read your data as character not numeric because otherwise the leading zeros will be lost:
text_data <- read_excel("./data/wordcount_vectors.xlsx", col_types="text")
Your example does not include spaces between the 1's and 0's but your picture does. Using your provided data:
wordcount_dfc <- as.character(wordcount_df)
wordcount_dfc
# [1] "100011000" "10102100" "1120210011"
wordcount_lst <- strsplit(wordcount_dfc, "") # Use " " if the values are separated by spaces
wordcount_lst <-sapply(wordcount_lst, as.integer)
wordcount_lst
# [[1]]
# [1] 1 0 0 0 1 1 0 0 0
#
# [[2]]
# [1] 1 0 1 0 2 1 0 0
#
# [[3]]
# [1] 1 1 2 0 2 1 0 0 1 1
sapply(wordcount_lst, length)
# [1] 9 8 10
You cannot just bind the columns together because they are of different lengths. The simplest approach would be to use wordcount_lst directly, but if you need to make a data frame, you need to add NAs to pad out short vectors:
size <- max(sapply(wordcount_lst, length))
wordcount_df <- data.frame(sapply(wordcount_lst, function(x) c(x, rep(NA, size - length(x)))))
wordcount_df
# X1 X2 X3
# 1 1 1 1
# 2 0 0 1
# 3 0 1 2
# 4 0 0 0
# 5 1 2 2
# 6 1 1 1
# 7 0 0 0
# 8 0 0 0
# 9 0 NA 1
# 10 NA NA 1

How to do a running calculation across vectors in R?

I created this data frame:
Count <- c(1:10)
Give <- c(0,0,5,0,0,5,0,5,0,5)
X <- c(rep(0,10))
Y <- c(rep(0,10))
Z <- c(rep(0,10))
X_Target <- 5
Y_Target <- 10
Z_Target <- 5
Basically I have 3 vectors (X,Y,Z) and a target for each one of them.
I want to have a new calculation for X,Y and Z that based on the vector Give.
Once the number on Give is bigger than 0 then it's need to be added to Vector X until it equel to X_Target. Then - the calcultion need to move to the next vector (Y) and do the same, and then to next vector...
The output should be like the following:
Count Give X Y Z
1 0 0 0 0
2 0 0 0 0
3 5 5 0 0
4 0 5 0 0
5 0 5 0 0
6 5 5 5 0
7 0 5 5 0
8 5 5 10 0
9 0 5 10 0
10 5 5 10 5
In this example I have only 3 vectors but please keep in mind that I'll have at least 60 vectors so I need it to be automatic as it can.
Hope I manage to explain myself :)
Thnanks!

It's ugly, but it gives the desired result.
tab1 = data.frame(
Count = c(1:10),
Give = c(0,0,5,0,0,5,0,5,0,5),
X = c(rep(0,10)),
Y = c(rep(0,10)),
Z = c(rep(0,10))
)
targets <- c(5,10,5)
tab2 <- tab1
start <- 2
for(col in 3:ncol(tab2)) {
target <- targets[col-2]
for(row in start:nrow(tab2)) {
if(tab2[row, 2] > 0 & tab2[row, col] < target) {
tab2[row, col] <- pmin(tab2[row - 1, col] + tab2[row, col - 1], target)
} else {
tab2[row, col] <- tab2[row - 1, col]
}
}
start <- which(tab2[, 2] > 0 & tab2[, col] == target)[2]
}
> tab2
Count Give X Y Z
1 1 0 0 0 0
2 2 0 0 0 0
3 3 5 5 0 0
4 4 0 5 0 0
5 5 0 5 0 0
6 6 5 5 5 0
7 7 0 5 5 0
8 8 5 5 10 0
9 9 0 5 10 0
10 10 5 5 10 5

Turn it into a data frame :
tab1 = data.frame(
Count = c(1:10),
Give =c(0,0,5,0,0,5,0,5,0,5),
X = c(rep(0,10)),
Y = c(rep(0,10)),
Z = c(rep(0,10))
)
# create a list of targets for looping
targets = c(X_Target, Y_Target, Z_Target)
Without using data.table you can just put the whole thing in a loop. It will work, but be much slower.
# loop through each column
for(col in seq(1,length(targets))){
print(col)
# loop through each row
for(row in seq(1, dim(tab1[2+col])[1])){
# condition
while(tab1[row,(2+col)] < targets[col] & tab1[row,2]>0){
tab1[row,(2+col)] = tab1[row,(2+col)] +tab1[row,2]
}
}
}

Here is something else to try, using tidyverse.
Put your data into long form, and include targets with a join.
In a loop through Count, find the first row for a given Count that is below target. For current and following rows that have matching names (X, Y, or Z), add Give amount.
In the end, put result back into wide form.
library(tidyverse)
df <- data.frame(Count, Give, X, Y, Z) %>%
pivot_longer(cols = X:Z) %>%
left_join(data.frame(X_Target, Y_Target, Z_Target) %>%
pivot_longer(cols = everything(),
names_to = c("name", ".value"),
names_pattern = "(\\w+)_(\\w+)"))
for (i in seq_along(Count)) {
below_target <- min(which(df$Count == i & df$value < df$Target))
name_rows <- which(df$name == df[below_target, "name", drop = T])
rows_to_change <- name_rows[name_rows >= below_target]
df[rows_to_change, "value"] <- df[rows_to_change, "value"] + df[below_target, "Give", drop = T]
}
df %>%
pivot_wider(id_cols = Count)
Output
Count X Y Z
<int> <dbl> <dbl> <dbl>
1 1 0 0 0
2 2 0 0 0
3 3 5 0 0
4 4 5 0 0
5 5 5 0 0
6 6 5 5 0
7 7 5 5 0
8 8 5 10 0
9 9 5 10 0
10 10 5 10 5

My approach was to make use of the cumulative sums of the Give and then track if that exceeds the targeted values for the columns. Then do some cleaning up.
targets <- c(X_Target, Y_Target, Z_Target)
targets_0 <- c(0, targets)
csum_give <- cumsum(Give)
# from cumsum give take off sum of previous targets
result <- sapply(1:length(targets),
function(x) csum_give - sum(targets_0[1:x]))
# Set max value to target max of column
sapply(1:length(targets),
function(x) result[result[, x] > targets[x], x] <<- targets[x])
# set min value to zero
result[which(result < 0)] <- 0
result
# [,1] [,2] [,3]
# [1,] 0 0 0
# [2,] 0 0 0
# [3,] 5 0 0
# [4,] 5 0 0
# [5,] 5 0 0
# [6,] 5 5 0
# [7,] 5 5 0
# [8,] 5 10 0
# [9,] 5 10 0
# [10,] 5 10 5

multiply multiple column and find sum of each column for multiple values

I'm trying to multiply column and get its names.
I have a data frame:
v1 v2 v3 v4 v5
0 1 1 1 1
0 1 1 0 1
1 0 1 1 0
I'm trying to multiplying each column with other, like:
v1v2
v1v3
v1v4
v1v5
and
v2v3
v2v4
v2v5
etc, and
v1v2v3
v1v2v4
v1v2v5
v2v3v4
v2v3v5
4 combination and 5 combination...if there is n column then n combination.
I'm try to use following code in while loop, but it is not working:
i<-1
while(i<=ncol(data)
{
results<-data.frame()
v<-i
results<- t(apply(data,1,function(x) combn(x,v,prod)))
comb <- combn(colnames(data),v)
colnames(results) <- apply(comb,v,function(x) paste(x[1],x[2],sep="*"))
results <- colSums(results)
}
but it is not working.
sample out put..
if n=3
v1v2 v1v3 v2v3
0 0 1
0 0 1
0 1 0
and colsum
v1v2 v1v3 v2v3
0 1 2
then
v1v2=0
v1v3=1
v2v3=2
this one is I'm trying?

Try this:
df <- read.table(text = "v1 v2 v3 v4 v5
0 1 1 1 1
0 1 1 0 1
1 0 1 1 0", skip = 1)
df
ll <- vector(mode = "list", length = ncol(df)-1)
ll <- lapply(2:ncol(df), function(ncols){
tmp <- t(apply(df, 1, function(rows) combn(x = rows, m = ncols, prod)))
if(ncols < ncol(df)){
tmp <- colSums(tmp)
}
else{
tmp <- sum(tmp)
}
names1 <- t(combn(x = colnames(df), m = ncols))
names(tmp) <- apply(names1, 1, function(rows) paste0(rows, collapse = ""))
ll[[ncols]] <- tmp
})
ll
# [[1]]
# V1V2 V1V3 V1V4 V1V5 V2V3 V2V4 V2V5 V3V4 V3V5 V4V5
# 0 1 1 0 2 1 2 2 2 1
#
# [[2]]
# V1V2V3 V1V2V4 V1V2V5 V1V3V4 V1V3V5 V1V4V5 V2V3V4 V2V3V5 V2V4V5 V3V4V5
# 0 0 0 1 0 0 1 2 1 1
#
# [[3]]
# V1V2V3V4 V1V2V3V5 V1V2V4V5 V1V3V4V5 V2V3V4V5
# 0 0 0 0 1
#
# [[4]]
# V1V2V3V4V5
# 0
Edit following comment
The results of the different set of column combinations can then be accessed by indexing (subsetting) the list. E.g. to access the "2 combinations", select the first element of the list, to access the "3rd combination", select the second element of the list, et c.
ll[[1]]
# V1V2 V1V3 V1V4 V1V5 V2V3 V2V4 V2V5 V3V4 V3V5 V4V5
# 0 1 1 0 2 1 2 2 2 1

rbind list of arbitrary number of dataframes

I have a list of dataframes with some overlapping columns in each. The number of dataframes in the list is unknown. How can I efficiently, in base, rbind the dataframes together and fill in non overlapping columns with zeros?
Example data:
x <- data.frame(a=1:2, b=1:2, c=1:2)
y <- data.frame(a=1:2, r=1:2, f=1:2)
z <- data.frame(b=1:3, c=1:3, v=1:3, t=c("A", "A", "D"))
L1 <- list(x, y, z)
Desired output:
a b c f r t v
1 1 1 1 0 0 0 0
2 2 2 2 0 0 0 0
3 1 0 0 1 1 0 0
4 2 0 0 2 2 0 0
5 0 1 1 0 0 A 1
6 0 2 2 0 0 A 2
7 0 3 3 0 0 D 3

Pad out each data frame with the missing columns, then rbind them:
allnames <- unique(unlist(lapply(L1, names)))
do.call(rbind, lapply(L1, function(df) {
not <- allnames[!allnames %in% names(df)]
df[, not] <- 0
df
}))

I have an old (and probably inefficient) function that does this. I've made one modification here to allow the fill to be specified.
RBIND <- function(datalist, keep.rownames = TRUE, fill = NA) {
Len <- sapply(datalist, ncol)
if (all(diff(Len) == 0)) {
temp <- names(datalist[[1]])
if (all(sapply(datalist, function(x) names(x) %in% temp))) tryme <- "basic"
else tryme <- "complex"
}
else tryme <- "complex"
almost <- switch(
tryme,
basic = { do.call("rbind", datalist) },
complex = {
Names <- unique(unlist(lapply(datalist, names)))
NROWS <- c(0, cumsum(sapply(datalist, nrow)))
NROWS <- paste(NROWS[-length(NROWS)]+1, NROWS[-1], sep=":")
out <- lapply(1:length(datalist), function(x) {
emptyMat <- matrix(fill, nrow = nrow(datalist[[x]]), ncol = length(Names))
colnames(emptyMat) <- Names
emptyMat[, match(names(datalist[[x]]),
colnames(emptyMat))] <- as.matrix(datalist[[x]])
emptyMat
})
do.call("rbind", out)
})
Final <- as.data.frame(almost, row.names = 1:nrow(almost))
Final <- data.frame(lapply(Final, function(x) type.convert(as.character(x))))
if (isTRUE(keep.rownames)) {
row.names(Final) <- make.unique(unlist(lapply(datalist, row.names)))
}
Final
}
Here it is on your sample data.
RBIND(L1, fill = 0)
# a b c r f v t
# 1 1 1 1 0 0 0 0
# 2 2 2 2 0 0 0 0
# 1.1 1 0 0 1 1 0 0
# 2.1 2 0 0 2 2 0 0
# 1.2 0 1 1 0 0 1 A
# 2.2 0 2 2 0 0 2 A
# 3 0 3 3 0 0 3 D

Aggregating every 10 columns in binary matrice

I am new to R.
I would like to transform a binary matrix like this:
example:
" 1874 1875 1876 1877 1878 .... 2009
F 1 0 0 0 0 ... 0
E 1 1 0 0 0 ... 0
D 1 1 0 0 0 ... 0
C 1 1 0 0 0 ... 0
B 1 1 0 0 0 ... 0
A 1 1 0 0 0 ... 0"
Since, columns names are years I would like to aggregate them in decades and obtain something like:
"1840-1849 1850-1859 1860-1869 .... 2000-2009
F 1 0 0 0 0 ... 0
E 1 1 0 0 0 ... 0
D 1 1 0 0 0 ... 0
C 1 1 0 0 0 ... 0
B 1 1 0 0 0 ... 0
A 1 1 0 0 0 ... 0"
I am used to python and do not know how to do this transformation without making loops!
Thanks, isabel

It is unclear what aggregation you want, but using the following dummy data
set.seed(42)
df <- data.frame(matrix(sample(0:1, 6*25, replace = TRUE), ncol = 25))
names(df) <- 1874 + 0:24
The following counts events in each 10-year period.
Get the years as a numeric variable
years <- as.numeric(names(df))
Next we need an indicator for the start of each decade
ind <- seq(from = signif(years[1], 3), to = signif(tail(years, 1), 3), by = 10)
We then apply over the indices of ind (1:(length(ind)-1)), select columns from df that are the current decade and count the 1s using rowSums.
tmp <- lapply(seq_along(ind[-1]),
function(i, inds, data) {
rowSums(data[, names(data) %in% inds[i]:(inds[i+1]-1)])
}, inds = ind, data = df)
Next we cbind the resulting vectors into a data frame and fix-up the column names:
out <- do.call(cbind.data.frame, tmp)
names(out) <- paste(head(ind, -1), tail(ind, -1) - 1, sep = "-")
out
This gives:
> out
1870-1879 1880-1889 1890-1899
1 4 5 6
2 4 6 6
3 2 5 5
4 5 5 7
5 3 3 7
6 5 5 4
If you want simply a binary matrix with a 1 indicating at least 1 event happened in that decade, then you can use:
tmp2 <- lapply(seq_along(ind[-1]),
function(i, inds, data) {
as.numeric(rowSums(data[, names(data) %in% inds[i]:(inds[i+1]-1)]) > 0)
}, inds = ind, data = df)
out2 <- do.call(cbind.data.frame, tmp2)
names(out2) <- paste(head(ind, -1), tail(ind, -1) - 1, sep = "-")
out2
which gives:
> out2
1870-1879 1880-1889 1890-1899
1 1 1 1
2 1 1 1
3 1 1 1
4 1 1 1
5 1 1 1
6 1 1 1
If you want a different aggregation, then modify the function applied in the lapply call to use something other than rowSums.

This is another option, using modular arithmetic to aggregate the columns.
# setup, borrowed from #GavinSimpson
set.seed(42)
df <- data.frame(matrix(sample(0:1, 6*25, replace = TRUE), ncol = 25))
names(df) <- 1874 + 0:24
result <- do.call(cbind,
by(t(df), as.numeric(names(df)) %/% 10 * 10, colSums))
# add -xxx9 to column names, for each decade
dimnames(result)[[2]] <- paste(colnames(result), as.numeric(colnames(result)) + 9, sep='-')
# 1870-1879 1880-1889 1890-1899
# V1 4 5 6
# V2 4 6 6
# V3 2 5 5
# V4 5 5 7
# V5 3 3 7
# V6 5 5 4
If you wanted to aggregate with something other than sum, replace the call to
colSums with something like function(cols) lapply(cols, f), where f is the aggregating
function, e.g., max.

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

extract information from a data frame - r

Related

Separate long numbers into individual components in a data frame

How to do a running calculation across vectors in R?

multiply multiple column and find sum of each column for multiple values

rbind list of arbitrary number of dataframes

Aggregating every 10 columns in binary matrice

Categories

Resources