Subset and group dataframe by matching columns and values in R

I have 2 dataframes; df1 contains a GroupID and continuous variables like so:
GroupID Var1 Var2 Var3 Var4
1 20.33115 19.59319 0.6384765 0.6772862
1 31.05899 23.14446 0.5796645 0.7273182
2 24.28984 20.99047 0.6425050 0.6865804
2 22.47856 21.36709 0.6690020 0.6368560
3 21.65817 20.99444 0.6829786 0.6461840
3 23.45899 21.57718 0.6655482 0.6473043
And df2 contains cutoff values (ct) for each variable:
Var1ct Var2ct Var3ct Var4ct
22.7811 20.3349 0.7793 0.4294
What I want to do is, for each variable in df1, find the number of rows where the value is greater than the cutoff value in its associated column in df2, and return that number for each GroupID, so the output would look like this:
GroupID N-Var1 N-Var2 N-Var3 N-Var4
1 62 78 33 99
2 69 25 77 12
3 55 45 27 62
df1 is ~2 million rows, unevenly distributed by GroupID, with 30 variable columns I need the count for. I am just looking for a more efficient way than typing out the same function for all 30 variables.

Here's a way in dplyr:
library(dplyr)
df1 %>%
  group_by(GroupID) %>%
  # cur_column() (e.g. "Var1") is matched against colnames(df2) to pick the matching cutoff column
  summarise(across(everything(), ~ sum(.x > df2[grepl(cur_column(), colnames(df2))][, 1])))
GroupID Var1 Var2 Var3 Var4
<int> <int> <int> <int> <int>
1 1 1 1 0 2
2 2 1 2 0 2
3 3 1 2 0 2
data
df1 <- read.table(header = T, text = "GroupID Var1 Var2 Var3 Var4
1 20.33115 19.59319 0.6384765 0.6772862
1 31.05899 23.14446 0.5796645 0.7273182
2 24.28984 20.99047 0.6425050 0.6865804
2 22.47856 21.36709 0.6690020 0.6368560
3 21.65817 20.99444 0.6829786 0.6461840
3 23.45899 21.57718 0.6655482 0.6473043 ")
df2 <- read.table(header = T, text = "Var1ct Var2ct Var3ct Var4ct
22.7811 20.3349 0.7793 0.4294")

A data.table approach that should scale well:
library(data.table)
# if df1 and df2 are not data.tables, use
# setDT(df1); setDT(df2)
# we need similar column names in df1 and df2 to join easily
setnames(df2, names(df1)[2:5])
# melt df1 and df2 to long format
df1.long <- melt(df1, id.vars = "GroupID")
df2.long <- melt(df2, measure.vars = names(df2))
# join ct-values
df1.long[df2.long, ct := i.value, on = .(variable)]
# summarise
ans <- df1.long[, sum(value > ct), by = .(GroupID, variable)]
# cast to wide
dcast(ans, GroupID ~ variable, value.var = "V1")
# GroupID Var1 Var2 Var3 Var4
# 1: 1 1 1 0 2
# 2: 2 1 2 0 2
# 3: 3 1 2 0 2
sample data
df1 <- fread("GroupID Var1 Var2 Var3 Var4
1 20.33115 19.59319 0.6384765 0.6772862
1 31.05899 23.14446 0.5796645 0.7273182
2 24.28984 20.99047 0.6425050 0.6865804
2 22.47856 21.36709 0.6690020 0.6368560
3 21.65817 20.99444 0.6829786 0.6461840
3 23.45899 21.57718 0.6655482 0.6473043 ")
df2 <- fread("Var1ct Var2ct Var3ct Var4ct
22.7811 20.3349 0.7793 0.4294")

Related

Reshape from unstructured dataset in R

I am trying to reshape a dataset by switching some cells' information. Here is what my sample dataset looks like.
data <- data.frame(var1 = c("Text","A","B","C","D"),
var2 = c("Text",NA, 1,0,1),
var3 = c("112-1",NA,NA,"text",NA),
var4 = c("Text",1,0,NA, NA),
var5 = c("113-1",NA,"text",NA,NA))
> data
var1 var2 var3 var4 var5
1 Text Text 112-1 Text 113-1
2 A <NA> <NA> 1 <NA>
3 B 1 <NA> 0 text
4 C 0 text <NA> <NA>
5 D 1 <NA> <NA> <NA>
It needs some cleaning first. var1 has the item information. var2 and var4 have score information. var3 and var5 have id information in the first row.
I will need to reshape this dataset as below.
> data.1
id A B C D
1 112 NA 1 0 1
2 113 1 0 NA NA
Considering this datafile with more columns (e.g. var6, var7, var8, var9, etc.) following the same pattern, how can I reshape to this desired dataset?
This isn't much different from my answer yesterday, but this will give you the result you asked for. Shift that first row over one column so that the id is in the same column as the needed values, remove the unnecessary columns, then make row one the column names. Add some pivots and then it should be roughly what you need:
data <- data.frame(var1 = c("Text","A","B","C","D"), var2 = c("Text",NA, 1,0,1), var3 = c("112",NA,NA,NA,NA), var4 = c("Text",1,0,NA, NA), var5 = c(113,NA,NA,NA,NA))
library(dplyr)
library(tidyr)
data2 <- data %>%
  mutate_all(as.character) # Making character to avoid factor issues
data2[1, 2:(ncol(data2) - 1)] <- data2[1, 3:ncol(data2)] # Shifting first row over one column
data3 <- data2 %>%
  select(-var3, -var5) # Removing the unneeded columns
colnames(data3) <- data3[1, ] # Taking the first row and making it the column names
data3 <- data3[-1, ] # Removing row 1, since it was made into column names
data3 %>%
  tidyr::pivot_longer(-Text, names_to = "id", values_to = "time") %>% # Making the data into longer format
  tidyr::pivot_wider(names_from = Text, values_from = time) # Then back into wide
You could shift the first row, keep only the first and the even-numbered columns, and transpose.
data[1, ] <- data[1, -1]
data <- data[c(TRUE, seq_len(ncol(data))[-1] %% 2 == 0)]
setNames(as.data.frame(t(data[, -1]), row.names=FALSE), c('id', data[[1]][-1])) |>
type.convert(as.is=TRUE)
# id A B C D
# 1 112-1 NA 1 0 1
# 2 113-1 1 0 NA NA
BTW, how do you get such data? Maybe you have an x-y-problem.
library(dplyr)
library(tidyr)
library(stringr)
#First rename the columns to more appropriate
n = 2 #Number of pairs of columns you have (here 2)
nam <- do.call(paste0, (expand.grid(c("n_", "id_"), seq(n))))
colnames(data) <- c("col", nam)
#Then, the data manipulation
data %>%
  mutate(across(starts_with("id"), ~ first(str_remove(.x, "-")))) %>%
  fill(starts_with("id")) %>%
  slice(-1) %>%
  pivot_longer(-col, names_to = c(".value", "rn"), names_sep = "_") %>%
  pivot_wider(names_from = "col", values_from = "n") %>%
  select(-rn)
id A B C D
1 1121 NA 1 0 1
2 1131 1 0 NA NA

Count missing values per class

I am looking to check the pattern of missing values according to a class label (dependent variable) in my data. The output I want is the class labels and the number of missing values in the class.
library(tidyverse)
fakeData <- data.frame(var1 = c(1,2,NA,4,NA,6,7,8,9,10),
var2=c(11,NA,NA,14,NA,16,17,NA,19,NA),
Class = c(rep("A", 5), rep("B", 5)))
fakeData %>% group_by(Class) %>% summarize(numMissing = sum(is.na()))
Error in summarise_impl(.data, dots) :
Evaluation error: 0 arguments passed to 'is.na' which requires 1.
What is wrong with my approach here?
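The immediate problem is that is.na() is called with no argument; it needs a column (or set of values) to check. A minimal sketch of a direct fix for this two-column case, keeping the original approach, would be:
fakeData %>%
  group_by(Class) %>%
  summarise(numMissing = sum(is.na(var1)) + sum(is.na(var2)))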
I think this is a cleaner solution, using the tidyverse only. You don't need to know the number of columns. You can also use ?select_helpers in gather() to select columns, e.g. starts_with("var") (see the sketch after the output below).
fakeData %>%
  group_by(Class) %>%
  gather(variable, value, -Class) %>% # all except Class
  summarise(missing_n = sum(is.na(value)))
# A tibble: 2 x 2
Class missing_n
<fctr> <int>
1 A 5
2 B 2
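For reference, a sketch of the starts_with() variant mentioned above (same fakeData; this should give the same counts):
fakeData %>%
  gather(variable, value, starts_with("var")) %>% # gather only the var* columns
  group_by(Class) %>%
  summarise(missing_n = sum(is.na(value)))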
Perhaps we can do
fakeData %>%
  group_by(Class) %>%
  summarise_all(funs(sum(is.na(.)))) %>%
  transmute(Class, numMissing = var1 + var2)
If we have many columns, then use purrr::reduce
fakeData %>%
  group_by(Class) %>%
  summarise_all(funs(sum(is.na(.)))) %>%
  transmute(Class, numMissing = .[-1] %>% reduce(`+`))
# or with rowSums
# transmute(Class, numMissing = rowSums(.[-1]))
I would suggest melting the dataset into long format using the reshape library. Then just use the aggregate function by the Class variable.
library(reshape)
fakeData <- data.frame(var1 = c(1,2,NA,4,NA,6,7,8,9,10),
var2=c(11,NA,NA,14,NA,16,17,NA,19,NA),
Class = c(rep("A", 5), rep("B", 5)))
fData <- melt(fakeData, measure.vars = c("var1", "var2"))
fData
Class variable value
1 A var1 1
2 A var1 2
3 A var1 NA
4 A var1 4
5 A var1 NA
6 B var1 6
7 B var1 7
8 B var1 8
9 B var1 9
10 B var1 10
11 A var2 11
12 A var2 NA
13 A var2 NA
14 A var2 14
15 A var2 NA
16 B var2 16
17 B var2 17
18 B var2 NA
19 B var2 19
20 B var2 NA
with(fData, aggregate(value, list(Class), function(x) { sum(is.na(x)) }))
Group.1 x
1 A 5
2 B 2

R Table with variables x levels

I have a dataframe with multiple variables, each of which has values of TRUE, FALSE, or NA. I'm trying to summarize the data, but can't get anything to work quite the way I want.
names <- c("n1","n2","n3","n4","n5","n6")
groupname <- c("g1","g2","g3","g4","g4","g4")
var1 <- c(TRUE,TRUE,NA,FALSE,TRUE,NA)
var2 <- c(FALSE,TRUE,NA,FALSE,TRUE,NA)
var3 <- c(FALSE,TRUE,NA,FALSE,TRUE,NA)
df <- data.frame(names,groupname,var1,var2,var3)
I'm trying to summarize the data for individual groups:
G4 TRUE FALSE NA
var1 3 1 2
var2 2 2 2
var3 2 2 2
I can do table(groupname,var1) to do them individually, but I'm trying to get it all in a single table. Any suggestions?
using dplyr
library(dplyr)
df %>%
  gather("key", "value", var1:var3) %>%
  group_by(key) %>%
  summarise(true = sum(value == TRUE, na.rm = TRUE),
            false = sum(!value, na.rm = TRUE),
            missing = sum(is.na(value)))
# key true false missing
#1 var1 3 1 2
#2 var2 2 2 2
#3 var3 2 2 2
In base R, you could use table to get the counts, lapply to run through the variables, and do.call to put the results together. A minor subsetting with [ orders the columns as desired.
do.call(rbind, lapply(df[3:5], table, useNA="ifany"))[, c(2,1,3)]
TRUE FALSE <NA>
var1 3 1 2
var2 2 2 2
var3 2 2 2
This will work if each variable has all levels (TRUE, FALSE, NA). If one of the levels is missing, you can tell table to fill it with a 0 count by feeding it a factor variable.
Here is an example.
# expand data set
df$var4 <- c(TRUE, NA)
do.call(rbind, lapply(df[3:6],
function(i) table(factor(i, levels=c(TRUE, FALSE, NA)),
useNA="ifany")))[, c(2,1,3)]
FALSE TRUE <NA>
var1 1 3 2
var2 2 2 2
var3 2 2 2
var4 0 3 3

Drop levels of factor for which there is one missing value for one column in R

I would like to drop any occurrence of a factor level for which one row contains a missing value
Example:
ID var1 var2
1 1 2
1 NA 3
2 1 2
2 2 4
So, in this hypothetical, what would be left would be:
ID var1 var2
2 1 2
2 2 4
Here's a possible data.table solution (sorry @rawr)
library(data.table)
setDT(df)[, if (all(!is.na(.SD))) .SD, ID]
# ID var1 var2
# 1: 2 1 2
# 2: 2 2 4
If you only want to check var1 then
df[, if (all(!is.na(var1))) .SD, ID]
# ID var1 var2
# 1: 2 1 2
# 2: 2 2 4
Assuming that NAs would occur in both var columns,
df[with(df, !ave(!!rowSums(is.na(df[,-1])), ID, FUN=any)),]
# ID var1 var2
#3 2 1 2
#4 2 2 4
Or if it is only specific to var1
df[with(df, !ave(is.na(var1), ID, FUN=any)),]
# ID var1 var2
#3 2 1 2
#4 2 2 4
Or using dplyr
library(dplyr)
df %>%
group_by(ID) %>%
filter(all(!is.na(var1)))
# ID var1 var2
#1 2 1 2
#2 2 2 4
data
df <- structure(list(ID = c(1L, 1L, 2L, 2L), var1 = c(1L, NA, 1L, 2L
), var2 = c(2L, 3L, 2L, 4L)), .Names = c("ID", "var1", "var2"
), class = "data.frame", row.names = c(NA, -4L))
Here's one more option in base R. It will check all columns for NAs.
df[!df$ID %in% df$ID[rowSums(is.na(df)) > 0],]
# ID var1 var2
#3 2 1 2
#4 2 2 4
If you only want to check in column "var1" you can do:
df[!with(df, ID %in% ID[is.na(var1)]),]
# ID var1 var2
#3 2 1 2
#4 2 2 4
In the current development version of data.table, there's a new implementation of na.omit for data.tables, which takes cols = and invert = arguments.
cols = allows you to specify the columns in which to look for NAs, and invert = TRUE returns the NA rows instead of omitting them.
You can install the devel version by following these instructions. Or you can wait for 1.9.6 on CRAN at some point. Using that, we can do:
require(data.table) ## 1.9.5+
setkey(setDT(df), ID)
df[!na.omit(df, invert = TRUE)]
# ID var1 var2
# 1: 2 1 2
# 2: 2 2 4
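For completeness, a sketch of the cols = argument mentioned above, restricting the NA check to var1 only (same keyed df; in this sample the result is the same, since the only NA is in var1):
df[!na.omit(df, cols = "var1", invert = TRUE)]
#    ID var1 var2
# 1:  2    1    2
# 2:  2    2    4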
How this works:
setDT converts data.frame to data.table by reference.
setkey sorts the data.table by the columns provided and marks those columns as key columns so that we can perform a join.
na.omit(df, invert = TRUE) gives just those rows that have NA anywhere.
X[!Y] does an anti-join by joining on the key column ID, and returns all the rows that don't match ID = 1 (from Y). Check this post to read in detail about data.table's joins.
HTH

Removing rows when flipped in two columns

Considering the following data frame:
df <- data.frame(var1 = 1:5, var2 = c(5,6,7,8,1))
> df
var1 var2
1 1 5
2 2 6
3 3 7
4 4 8
5 5 1
I'd like to remove all rows whose values are flipped across the two columns. In this case, it would be row 1 and row 5 as the values 1 and 5 in row 1 are flipped to 5 and 1 in row 5. These two rows should be removed.
I hope it came clear what I am asking for :-)
Kind regards!
Perhaps something like this could work too:
df <- data.frame(var1 = 1:5, var2 = c(5,6,7,8,1))
df[!do.call(paste, df) %in% do.call(paste, rev(df)), ]
var1 var2
2 2 6
3 3 7
4 4 8
I'd have to test it on a few more cases though, but the general idea is to use rev to reverse the order of the columns in "df", paste them together, and compare that with the pasted columns from "df".
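To see what the comparison operates on, here is a small sketch of the two pasted key vectors for the sample df (expected values shown as comments):
do.call(paste, df)      # "1 5" "2 6" "3 7" "4 8" "5 1"
do.call(paste, rev(df)) # "5 1" "6 2" "7 3" "8 4" "1 5"
Rows whose key appears among the reversed-column keys ("1 5" and "5 1" here) are the flipped pairs and get dropped by the %in% test.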
Here's a simple but not especially elegant way: make a reversed data frame with a flag, and then merge it on to df:
# Make a reversed dataset
fd <- data.frame(var1 = df$var2, var2 = df$var1, flag = TRUE)
# Merge it onto your original df, then drop the matched rows and the flag var
df.sub <- subset(merge(x = df, y = fd, by = c("var1", "var2"), all.x = TRUE),
                 subset = is.na(flag),
                 select = c("var1", "var2"))
Using a bit of maths: two rows are the same up to a permutation if the sum and the absolute value of the difference are the same:
df[with(df, !duplicated(data.frame(var1 + var2, abs(var1 - var2)), fromLast = TRUE)),]
# var1 var2
#2 2 6
#3 3 7
#4 4 8
#5 5 1
edit: should've read the question more carefully, to remove both duplicates, follow Ananda's suggestion:
df.ind = with(df, data.frame(var1 + var2, abs(var1 - var2)))
df[!duplicated(df.ind) & !duplicated(df.ind, fromLast = TRUE),]
# var1 var2
#2 2 6
#3 3 7
#4 4 8
If creating a copy doesn't cause memory issues then this works as well -
df <- data.frame(var1 = 1:5, var2 = c(5,6,7,8,1))
df2 <- data.frame(var12 = 1:5, var22 = c(5,6,7,8,1))
df3 <- merge(df,df2, by.x = 'var2', by.y = 'var12', all.x = TRUE)
df3 <- subset(
df3,
is.na(var22),
select = c('var1','var2')
)
Output:
> df3
var1 var2
3 2 6
4 3 7
5 4 8
I tried merging df with df but that gives a warning about the column var2 being duplicated. Anybody know what to do?
If you can assume there are no duplicates in the data frame, here's a one-line answer, but still not too concise:
library(data.table) # for rbindlist
df[!duplicated(rbindlist(list(df, df[, 2:1])))[nrow(df) + 1:nrow(df)], ]
## var1 var2
## 2 2 6
## 3 3 7
## 4 4 8
rbindlist is necessary here because rbind(df, df[,2:1]) will match by column name rather than index, so the other option is something like rbind(df, setnames(df[,2:1], names(df))) (spelled out in the sketch at the end of this answer). If you want to keep duplicates from the original, this gets even more unpleasant:
> df <- data.frame(var1 = 1:5, var2 = c(5,6,7,8,1))
> df<-rbind(df,c(2,6))
> df[!duplicated(rbindlist(list(df,df[,2:1])))[nrow(df)+1:nrow(df)],]
var1 var2
2 2 6
3 3 7
4 4 8
> df[!duplicated(rbindlist(list(df,df[,2:1])))[nrow(df)+1:nrow(df)] | duplicated(df),]
var1 var2
2 2 6
3 3 7
4 4 8
6 2 6
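Spelled out, the setnames alternative mentioned above would look something like this sketch (using the original five-row df, before the duplicate row was added; renaming the swapped copy means rbind's by-name matching no longer undoes the swap, so this should reproduce the first snippet's result):
library(data.table) # for setnames
swapped <- setnames(df[, 2:1], names(df)) # df[, 2:1] is a fresh copy, so df itself is untouched
df[!duplicated(rbind(df, swapped))[nrow(df) + 1:nrow(df)], ]
#   var1 var2
# 2    2    6
# 3    3    7
# 4    4    8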
