Create new data frame depending on the most extreme value in rows - r

I have the following data frame and I would like to create a new one that will be like the one below.
ID1 ID2 ID3 ID4
x1_X 0 10 4 7
x2_X 2 12 5 8
x3_X 3 1 3 5
y1_Y 4 13 6 4
y2_Y 5 14 1 9
y3_Y 2 11 1 5
y4_Y 1 1 2 3
z1_Z 1 0 0 5
z2_Z 3 6 7 7
New data frame
ID1 ID2 ID3 ID4
X x3 x2 x2 x2
Y y2 y2 y1 y2
Z z2 z2 z2 z2
Basically the idea is the following:
For each ID I want to find which of the rownames (x1_X,x2_X,x3_X) has the most extreme value and assign this to name X since in the rownames I have subgroups.
My data frame is huge: 1700 columns and 100000 rows.

First we need to split the group and subgroup labels:
grp <- strsplit(row.names(df), "_")
And if performance is an issue, I think data.table is our best choice:
library(data.table)
df$group <- sapply(grp, "[", 2)
subgroup <- sapply(grp, "[", 1)
dt <- data.table(df)
And we now have access to the single line:
result <- dt[,lapply(.SD, function(x) subgroup[.I[which.max(x)]]), by=group]
Which splits the data.table by the character after the underscore (by=group) and then, for every column of the rectangular subset (.SD) we get the index in the sub-rectangle (which.max), and then map it back to the whole data.table (.I), and then extract the relevant subgroup (subgroup).
The data.table package is meant to be quite efficient, though you might want to look into indexing your data.table if you're going to be querying it multiple times.

Your table:
df <- read.table (text= " ID1 ID2 ID3 ID4
x1_X 0 10 4 7
x2_X 2 12 5 8
x3_X 3 1 3 5
y1_Y 4 13 6 4
y2_Y 5 14 1 9
y3_Y 2 11 1 5
y4_Y 1 1 2 3
z1_Z 1 0 0 5
z2_Z 3 6 7 7", header = T)
Split rownames to get groups:
library(plyr)
df_names <- ldply(strsplit (rownames(df), "_"))
colnames(df_names) <- c ("group1", "group2")
df2 <- cbind (df, df_names)
Create new table:
df_new <- data.frame (matrix(nrow = length(unique (df2$group2)),
ncol = ncol(df)))
colnames(df_new) <- colnames(df)
rownames (df_new) <- unique (df_names[["group2"]])
Filling new table with a loop:
for (i in 1:ncol (df_new)) {
for (k in 1:nrow (df_new)) {
col0 <- colnames (df_new)[i]
row0 <- rownames (df_new)[k]
sub0 <- df2 [df2$group2 == row0, c(col0, "group1")]
df_new [k,i] <- sub0 [sub0[1]==max (sub0[1]), 2]
}
}

Related

How to create a parameter using $ to select from data frame

I have three different data frames that are similar in their columns such:
df1 df2 df3
Class 1 2 3 Class 1 2 3 Class 1 2 3
A 5 3 2 A 7 3 10 A 5 4 1
B 9 1 4 B 2 6 2 A 2 6 2
C 7 9 8 C 4 7 1 A 12 3 8
I would like to iterate through the three files and select the data from the columns with similar name. In other words, I want to iterate three times and everytime select data of column 1, then column 2, and then column 3 and merge them in one data frame.
To do that, I did the following:
df1 <- read.csv(R1)
df2 <- read.csv(R2)
df3 <- read.csv(R3)
df <- data.frame(Class=character(), B1_1=integer(), B1_2=integer(), B1_3=integer(), stringsAsFactors=FALSE)
for(i in 1:3){
nam <- paste("X", i, sep = "") #here I want to call the column name such as X1, X2, and X3
df[seq_along(df1[nam]), ]$B1_1 <- df1[nam]
df[seq_along(df2[nam]), ]$B1_2 <- df2[nam]
df[seq_along(df3[nam]), ]$B1_3 <- df3[nam]
df$Class <- df1$Class
}
In this line df[seq_along(df1[nam]), ]$B1_1 <- df1[nam], I followed the solution from this but this produces the following error:
Error in `$<-.data.frame`(`*tmp*`, "B1_1", value = list(X1 = c(5L, 7L, :
replacement has 10 rows, data has 1
Do you have any idea how to solve it?

merging more than 2 data frames whilst assigning an identifier factor in R

Take this very simple RWE, I want to know what package can be used to automatically assign a factor (preferable the data frame name) when we merge two or more data.frames
I have manually defined the factor in the example below and shown the desired output. But i want to automate it as I have over 100 tables to merge. Note that the headers within each df are constant, only the name itself changes
A <- 1:5
B <- 5:1
df1 <- data.frame(A,B)
A <- 2:6
B <- 6:2
df2 <- data.frame(A,B)
df1$ID <- rep("df1", 5)
df2$ID <- rep("df2", 5)
big_df <- rbind(df1,df2)
Assuming that your data.frame names follow a certain pattern like beginning with "df" followed by numbers and they are not inside a list but simply in your global environment, you can use the following:
library(data.table)
bigdf <- rbindlist(Filter(is.data.frame, mget(ls(pattern = "^df\\d+"))), id = "ID")
Without data.table, you could do it as follows:
lst <- Filter(is.data.frame, mget(ls(pattern = "^df\\d+")))
bigdf <- do.call(rbind, Map(function(df, id) transform(df, ID=id), lst, names(lst)))
Consider the following:
library(dplyr)
cof_df <- bind_rows(df1, df2, .id="ID")
cof_df
ID A B
1 1 1 5
2 1 2 4
3 1 3 3
4 1 4 2
5 1 5 1
6 2 2 6
7 2 3 5
8 2 4 4
9 2 5 3
10 2 6 2
And then:
cof_df$ID <- factor(cof_df$ID,
levels = c(1,2),
labels = paste0("df", unique(cof_df$ID)))
does the recoding.
A similar result can be obtained by naming the arguments in bind_rows, as in
cof_df <- bind_rows(df1=df1, df2=df2, .id="ID")
Another solution will be to use merge:
merged <- merge(df1, df2, all=TRUE, sort =FALSE)
> merged
A B ID
1 1 5 df1
2 2 4 df1
3 3 3 df1
4 4 2 df1
5 5 1 df1
6 2 6 df2
7 3 5 df2
8 4 4 df2
9 5 3 df2
10 6 2 df2

Place on specific positions of dataframes specific values

DF <- data.frame(x1=c(NA,7,7,8,NA), x2=c(1,4,NA,NA,4)) # a data frame with NA
WhereAreMissingValues <- which(is.na(DF), arr.ind=TRUE) # find the position of the missing values
Modes <- apply(DF, 2, function(x) {which(tabulate(x) == max(tabulate(x)))}) # find the modes of each column
DF
WhereAreMissingValues
Modes
I would like to replace the NAs of each column of DF with the mode, accordingly.
Please for some help.
Map provides here a one line solution:
data.frame(Map(function(u,v){u[is.na(u)]=v;u},DF, Modes))
# x1 x2
#1 7 1
#2 7 4
#3 7 4
#4 8 4
#5 7 4
Here's how I would do this.
First I'll define an helper function
Myfunc <- function(x) as.numeric(names(sort(-table(x)))[1L])
Then just use lapply over the data set
DF[] <- lapply(DF, function(x){x[is.na(x)] <- Myfunc(x) ; x})
DF
# x1 x2
# 1 7 1
# 2 7 4
# 3 7 4
# 4 8 4
# 5 7 4

Changing the values of a column for the values from another column

I have two datasets that look like this:
What I want is to change the values from the second column in the first dataset to the values from the second column from the second dataset. All the names in the first dataset are in the second one, and obviously my dataset is much bigger than that.
I was trying to use R to do that but I am very new at it. I was looking at the intersect command but I am not sure if it's going to work. I don't put any codes because I'm real lost here.
I also need that the order of the first columns (which are names) in the first dataset stays the same, but with the new values from the second column of the second dataset.
Agree with #agstudy, a simple use of merge would do the trick. Try something like this:
df1 <- data.frame(name=c("ab23242", "ab35366", "ab47490", "ab59614"),
X=c(72722, 88283, 99999, 114278.333))
df2 <- data.frame(name=c("ab35366", "ab47490", "ab59614", "ab23242" ),
X=c(12345, 23456, 34567, 456789))
df.merge <- merge(df1, df2, by="name", all.x=T)
df.merge <- df.merge[, -2]
Output:
name X.y
1 ab23242 456789
2 ab35366 12345
3 ab47490 23456
4 ab59614 34567
I think merge will keep order of first frame but you can also keep the order strictly by simply adding a column with order df1$order <- 1:nrow(df1) and later on sorting based on that column.
df1<- data.frame( name1 = letters[6:10], valuecol1=seq(2,10,by=2))
df2 <- data.frame( name2 = letters[1:10], valuecol2=10:1)
df2 [ match(df1$name1, df2$name2) , "valuecol2"] <- df1[ df1$name1 %in% df2$name2 , "valuecol1"]
df2
name2 valuecol2
1 a 10
2 b 9
3 c 8
4 d 7
5 e 6
6 f 2
7 g 4
8 h 6
9 i 8
10 j 10
This is what I thought might work, but doing replacements using indexing with match sometimes bites me in ways I need to adjust:
df2 [match(df1$name1, df2$name2) , "valuecol2"] <-
df1[ match(df1$name1, df2$name2) , "valuecol1"]
Here's how I tested it (edited).
> df2 <- data.frame( name2 = letters[1:10], valuecol2=10:1)
> df1<- data.frame( name1 = letters[1:5], valuecol1=seq(2,10,by=2))
> df2 [ match(df1$name1, df2$name2) , "valuecol2"] <- df1[ match(df1$name1, df2$name2) , "valuecol1"]
> df2
name2 valuecol2
1 a 2
2 b 4
3 c 6
4 d 8
5 e 10
6 f 5
7 g 4
8 h 3
9 i 2
10 j 1
Yep.... bitten again.
> df1<- data.frame( name1 = letters[6:10], valuecol1=seq(2,10,by=2))
> df2 [ match(df1$name1, df2$name2) , "valuecol2"] <- df1[ match(df1$name1, df2$name2) , "valuecol1"]
> df2
name2 valuecol2
1 a 2
2 b 4
3 c 6
4 d 8
5 e 10
6 f NA
7 g NA
8 h NA
9 i NA
10 j NA
How about this:
library(data.table)
# generate some random data
dt.1 <- data.table(id = 1:1000, value=rnorm(1000), key="id")
dt.2 <- data.table(id = 2*(500:1), value=as.numeric(1:500), key="id")
# objective is to replace value in df.1 with value from df.2 where id's match.
# data table joins - very efficient
# dt.1 now has 3 columns: id, value, and value.1 from dt.2$value
dt.1 <-dt.2[dt.1,nomatch=NA]
dt.1[is.na(value),]$value=dt.1[is.na(value),]$value.1
dt.1$value.1=NULL # get rid of extra column
NB: This sorts dt.1 by id which should be OK since it's sorted that way already.
Also: In future, please include data that can be imported into R. Images are not useful!

Sum of hybrid data frames depending on multiple conditions in R

This is a more complex follow-up to my previous question. The answer there was to use a matrix, but that doesn't work with data frames having values of different modes.
I want to combine data frames of different sizes, with character and integer columns, and calculate their sum depending on multiple conditions.
Conditions
sums are only calculated for those rows that have a matching "Name"-value
sums are calculated for matching column names only
if a cell in df4 is not 0 and not NA, the sum should be df3 + df4
else the sum should be df1 + df2 + df3
Example
> df1 <- data.frame(Name=c("Joe","Ann","Lee","Dan"), "1"=c(0,1,5,2), "2"=c(3,1,0,0), "3"=c(2,0,2,2), "4"=c(2,1,3,4))
> df1
Name X1 X2 X3 X4
1 Joe 0 3 2 2
2 Ann 1 1 0 1
3 Lee 5 0 2 3
4 Dan 2 0 2 4
> df2 <- data.frame(Name=c("Joe","Ann","Ken"), "1"=c(3,4,1), "2"=c(2,3,0), "3"=c(2,4,3))
> df2
Name X1 X2 X3
1 Joe 3 2 2
2 Ann 4 3 4
3 Ken 1 0 3
> df3 <- data.frame(Name=c("Lee","Ben"), "1"=c(1,3), "2"=c(3,4), "3"=c(4,3))
> df3
Name X1 X2 X3
1 Lee 1 3 4
2 Ben 3 4 3
The condition depends on this frame:
> df4 <- data.frame(Name=c("Lee","Ann","Dan"), "1"=c(6,0,NA), "2"=c(0,0,4), "3"=c(0,NA,0))
> df4
Name X1 X2 X3
1 Lee 6 0 0
2 Ann 0 0 NA
3 Dan NA 4 0
With the above examples, this is the expected result (* values depend on df4):
> dfsum
Name X1 X2 X3 X4
1 Joe 3 5 4 2
2 Ann 5 4 4 1
3 Lee 7* 3 6 3
4 Dan 2 4* 2 4
5 Ken 1 0 3 NA
6 Ben 3 4 3 NA
Possible steps?
First expand df1, df2, df3, df4 to 5 columns and 6 rows, fill missing data with NA.
Then for each data frame:
sort rows by "Name"
separate "Name" column from "X1"..."X4"
transform "X1"..."X4" columns to matrix
calculate sums of the matrices like in the answer to my other question but with the additional condition 1
transform result matrix to data frame
cbind the "Name" column with the result data frame
How can this be done in R?
Solution
#Ricardo Saporta's solution works with little changes:
Add , padValue=NA) in the four addCols().
As answered here, replace the definitions of sumD3D4 and dtsum with:
plus <- function(x) {
if(all(is.na(x))){
c(x[0],NA)} else {
sum(x,na.rm = TRUE)}
}
sumD3D4 <- setkey(rbind(dt3, dt4)[,lapply(.SD, plus), by = Name], "Name")
dtsum <- setkey(rbind(dt1, dt2, dt3)[, lapply(.SD, plus), by=Name], "Name")
If you use data.table instead of data.frame, you could use its by=xxxx feature, to add by name.
The code below should give you your expected results.
Please note that I am padding the data.tables with extra empty columns. However, we compute condTrue prior to then.
library(data.table)
dt1 <- data.table(df1)
dt2 <- data.table(df2)
dt3 <- data.table(df3)
dt4 <- data.table(df4)
# make sure all dt's have the same columns
#-----------------------------------------#
# identify which dt4 satisfy the condition
condTrue <- as.data.table(which(!(is.na(dt4) | dt4==0), arr.ind=TRUE))
# ignore column "Name" from dt4
condTrue <- condTrue[col>1]
# convert from (row, col) index to ("Name", columnName)
condTrue <- data.table(Name=dt4[condTrue$row, Name], colm=names(dt4)[condTrue$col], key="Name")
# First make a list of all the unique column names
allColumnNames <- unique(c(names(dt1), names(dt2), names(dt3), names(dt4)))
# add columns as necessary, using addCols (definted below)
addCols(dt1, allColumnNames)
addCols(dt2, allColumnNames)
addCols(dt3, allColumnNames)
addCols(dt4, allColumnNames)
sumD3D4 <- setkey(rbind(dt3, dt4)[, lapply(.SD, sum), by=Name], "Name")
dtsum <- setkey(rbind(dt1, dt2, dt3)[, lapply(.SD, sum), by=Name], "Name")
for (Nam in condTrue$Name) {
colsRepl <- condTrue[.(Nam)]$colm
valsRepl <- unlist(sumD3D4[.(Nam), c(colsRepl), with=FALSE])
dtsum[.(Nam), c(colsRepl) := as.list(valsRepl)]
}
dtsum
# Name 1 2 3 4
# 1: Ann 5 4 4 1
# 2: Ben 3 4 3 0
# 3: Dan 2 4 2 4
# 4: Joe 3 5 4 2
# 5: Ken 1 0 3 0
# 6: Lee 7 3 6 3
addCols <- function(x, cols, padValue=0) {
# adds to x any columns that are in cols but not in x
# Returns TRUE if columns were added
# FALSE if no columns added
colsMissing <- setdiff(cols, names(x))
# grab the actual DT name that was passed to function
dtName <- as.character(match.call()[2])
if (length(colsMissing)) {
get(dtName, envir=parent.frame(1))[, c(colsMissing) := padValue]
return(TRUE)
}
return(FALSE)
}

Resources