I have a data frame with thousands of rows and I need to output the minimum and maximum values of sections of data that belong to the same group and class. What I need is to read the first start value, compare it to the previous value in the end column and if smaller, jump to the next row and so on until the starting value is larger than the previous end value, then output the minimum starting value and the maximun for that section. My data is already ordered by group-class-start-end.
df <- data.frame(group = c("1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"),
class = c("2", "2", "2", "2", "2", "2", "2", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3"),
start = c("23477018","23535465","23567386","24708741","24708741","24708741","48339885","87274","87274","127819","1832772","1832772","1832772","6733569","7005524","7005524","7644572","8095433","8095433","8095433"),
end = c("47341413", "47341413", "47909872","42247834","47776347","47909872","53818713","3161655","3479466","3503792","3503792","4916249","5329014","8089225","12037894","13934484","12037894","12037894","13626119","13934484"))
The output that I want to achieve is:
group class start end
1 1 2 23477018 47909872
2 1 2 48339885 53818713
3 1 3 87274 5329014
4 1 3 6733569 13934484
Any ideas on how to achieve this will be very much appreciated.
I used data.table for this.
My approach was to first change start and end to integers or there will be ordering problems.
Find which rows meet the start > max(all prior ends), then use cumsum to give an increasing sub-group number.
Then it's just a simple min and max by sub-group.
There are no loops to make this as fast as possible.
library(data.table)
df <- data.frame(group = c("1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"),
class = c("2", "2", "2", "2", "2", "2", "2", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3"),
start = c("23477018","23535465","23567386","24708741","24708741","24708741","48339885","87274","87274","127819","1832772","1832772","1832772","6733569","7005524","7005524","7644572","8095433","8095433","8095433"),
end = c("47341413", "47341413", "47909872","42247834","47776347","47909872","53818713","3161655","3479466","3503792","3503792","4916249","5329014","8089225","12037894","13934484","12037894","12037894","13626119","13934484"))
setDT(df)
df[, c('start', 'end') := lapply(.SD, as.integer), .SDcols = c('start', 'end')]
df[, subgrp := cumsum(start > shift(cummax(.SD$end), fill = 0)), keyby = c('group', 'class')]
ans <- df[, .(start = min(start), end = max(end)), keyby = c('group', 'class', 'subgrp')]
ans[, subgrp := NULL][]
group class start end
1: 1 2 23477018 47909872
2: 1 2 48339885 53818713
3: 1 3 87274 5329014
4: 1 3 6733569 13934484
Here is a tidyverse solution:
library(tidyverse)
df <- data.frame(
group = c("1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"),
class = c("2", "2", "2", "2", "2", "2", "2", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3"),
start = c("23477018","23535465","23567386","24708741","24708741","24708741","48339885","87274","87274","127819","1832772","1832772","1832772","6733569","7005524","7005524","7644572","8095433","8095433","8095433"),
end = c("47341413", "47341413", "47909872","42247834","47776347","47909872","53818713","3161655","3479466","3503792","3503792","4916249","5329014","8089225","12037894","13934484","12037894","12037894","13626119","13934484"))
df %>%
group_by(group, class) %>%
mutate(
start = as.integer(start),
end = as.integer(end),
end_lag = lag(end),
larger_flag = case_when(start > end_lag & !is.na(end_lag) ~ 1, TRUE ~ 0),
sub_group = cumsum(larger_flag)) %>%
group_by(group, class, sub_group) %>%
summarise(
start = min(start),
end = max(end),
.groups = 'drop'
) %>%
select(-sub_group)
# A tibble: 4 x 4
group class start max
<chr> <chr> <int> <int>
1 1 2 23477018 47909872
2 1 2 48339885 53818713
3 1 3 87274 5329014
4 1 3 6733569 13934484
Related
I have a large dataset (>15000 cases) with variables of different numbers of values. Here as a much smaller sample data set:
df <- structure(list(year = c("1", "2", "3", "1", "2", "3", "1", "2",
"3", "1", "2", "3", "1", "2", "3", "1", "2", "3", "1", "2", "3",
"1", "2", "3", "1", "2", "3", "1", "2", "3"), var2tab = c("1",
"3", "1", "2", "2", "3", "3", "1", "3", "3", "3", "2", "2", "1",
"2", "1", "1", "2", "3", "3", "1", "1", "2", "2", "2", "3", "1",
"1", "3", "2"), group = c("1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "2", "2", "2", "2", "2", "2", "2", "2", "2",
"2", "2", "3", "3", "3", "3", "3", "3", "3", "3")),
class = "data.frame", row.names = c(NA, -30L))
First I save where in my large dataset the columns to be used are located. This helps with indexing.
var2use <- which(colnames(df) == "var2tab")
var2x <- which(colnames(df) == "group")
In my large dataset are variables with different numbers of values, so I count the number of values and years.
years_unique <- unique(df$year)
x_unique <- unique(df[var2x])
x_unique <- unlist (x_unique)
n_years <- length (years_unique)
n_var2x <- length (x_unique)
Now I create a list of dataframes for each group.
my_list <- NULL
my_list <- list()
for (i in x_unique) {
for (j in years_unique)
{
my_list[[i]] <- filter(df, df[var2x] == i)
}
}
Up to this point, everything works as desired...
"my_list" contains a data record of the values for the individual years for each group. On this basis, I would like to create new data sets by counting the variable "var2tab" for each year in order to be able to calculate the respective percentage values. However, so far I have only been able to do this for all years in total:
my_df <- NULL
my_df <- list()
for (j in years_unique) {
my_df[[j]] <-
count(my_list[[j]][var2use])
my_df[[j]]$var_rel <-
my_df[[j]][, 2] / sum(my_df[[j]][, 2])
my_df[[j]]$group <-
paste0(j)
}
Edit:
Here is a desired output for this sample input:
df_new <- data.frame(group = c("1", "1", "1", "2", "2", "2", "3", "3", "3"), value = c("1", "2", "3"), abs_year1 =c("1", "1", "1", "2", "2", "1", "1", "0", "1"), rel_year1 = c(".25", ".25", ".333", ".5", ".667", ".25", ".5", "0", ".333"), abs_year2 =c("1", "1", "0", "1", "0", "3", "1", "1", "2"), rel_year2 = c(".25", ".25", "0", ".25", "0", ".75", ".5", ".333", ".667"), abs_year3 =c("2", "2", "2", "1", "1", "0", "0", "2", "0"), rel_year3 = c(".5", ".5", ".667", ".25", ".333", "0", "0", ".667", "0"))
Here is a simple way to count the var2tab values grouped by group and year, and then to calculate their relative frequency within each group and year:
library(dplyr)
df %>%
count(group, year, var2tab) %>%
group_by(group, year) %>%
mutate(proportion = n / sum(n))
# # A tibble: 21 × 5
# # Groups: group, year [9]
# group year var2tab n pct
# <chr> <chr> <chr> <int> <dbl>
# 1 1 1 1 1 0.25
# 2 1 1 2 1 0.25
# 3 1 1 3 2 0.5
# 4 1 2 1 1 0.25
# 5 1 2 2 1 0.25
# 6 1 2 3 2 0.5
# 7 1 3 1 1 0.333
# 8 1 3 3 2 0.667
# 9 2 1 1 2 0.5
# 10 2 1 2 1 0.25
# # … with 11 more rows
# # ℹ Use `print(n = ...)` to see more rows
If you want them in a list separate data frames, you can add ... %>% group_split() to the end.
I have two numeric data values in one cell and I want to separate them to find the total and percentage of each column.
Such as:
Organization
Media
1,2
1
1
1,2
2
1
1
1
1
1
1
1
We could try
lapply(df1, function(x) proportions(table(as.numeric(unlist(strsplit(x, ","))))))
or may be
lapply(df1, function(x) addmargins(table(as.numeric(unlist(strsplit(x, ","))))))
data
df1 <- structure(list(Organization = c("1,2", "1", "2", "1", "3", "1",
"1"), Media = c("1", "1,2", "3", "1,4", "4", "1", "2")), class = "data.frame", row.names = c(NA,
-7L))
My full data set is larger but I have this reproducible sample:
structure(list(ID = c("121", "122", "123", "124"), Var1P = c("3",
"1", "3", "3"), Var1C = c("1", "3", "3", "1"), Var2P = c("1",
"1", "1", "1"), Var2P = c("1", "1", "1", "1"), Var3P = c("1",
"1", "1", "1"), Var3C = c("1", "1", "1", "1"), Var4P = c("1",
"1", "1", "1"), Var4C = c("1", "3", "1", "1"), Var5P = c("1",
"1", "3", "1"), Var5C = c("1", "1", "1", "1"), Var6P = c("1",
"1", "1", "1"), Var6C = c("1", "1", "1", "1"), Var7P = c("1",
"1", "1", "1"), Var7C = c("1", "1", "1", "1"), Var8 = c("0",
"1", "1", "1")), row.names = c(84L, 150L, 271L, 303L), class = "data.frame")
I want to subset the data so that only the observations with a score of 3 under Var1P or Var1C and all other columns a score of 2, 1, or 0. I have tried to use the simple subset function:
Data <- subset(Data, Var1P == 3 | Var1C == 3)
But, how can I make this argument even more complex to also tell R to also remove entries with scores of 3 under the other columns?
I thought simply using the following code would work:
Data <- subset(Data, Var1P == 3 | Var1C == 3 & 4:16 == 1 | 4:16 == 0)
It doesn't because R would then be looking at the row numbers, I think. I don't want to type out all of the column names because like I said my full data frame is much larger. I also am trying to avoid loops.
You can divide the data into two sets of columns, select_cols are the columns where you want to select rows with 3 in them and remove_cols are the remaining columns.
We can then select rows with rowSums where select_cols has 3 in it but remove_cols doesn't.
select_cols <- c('Var1P', 'Var1C')
remove_cols <- setdiff(names(Data), select_cols)
Data[rowSums(Data[select_cols] == 3) > 0 & rowSums(Data[remove_cols] == 3) == 0, ]
# ID Var1P Var1C Var2P Var2P Var3P Var3C Var4P Var4C Var5P Var5C Var6P Var6C Var7P Var7C Var8
#84 121 3 1 1 1 1 1 1 1 1 1 1 1 1 1 0
#303 124 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1
I have a large data set that has scenarios like the following:
df <- structure(list(Variable =c("A", "A", "A", "B", "B", "B", "C", "C", "C"),
Day1=c("1", "1", "-1", "1", "1", "-1", "1", "1", "1"),
Day2=c("1", "1", "-1", "1", "1", "-1", "1", "1", "1"),
Day3=c("1", "1", "1", "1", "1", "-1", "1", "1", "1"),
Day4=c("1", "1", "1", "1", "1", "1", "1", "1", "-1"),
Day5=c("1", "1", "1", "1", "1", "1", "1", "1", "-1")),
class= "data.frame", row.names=c(NA, -9L))
I want to create 3 tables. The first keeps all "Variable" duplicates that have 2 or more consecutive negative numbers prior to Day 3 and would look like the following:
df <- structure(list(Variable =c("A", "A", "A"),
Day1=c("1", "1", "-1"),
Day2=c("1", "1", "-1"),
Day3=c("1", "1", "1"),
Day4=c("1", "1", "1"),
Day5=c("1", "1", "1")),
class= "data.frame", row.names=c(NA, -3L))
The second would include all with negatives on the Day 3:
df <- structure(list(Variable =c("B", "B", "B"),
Day1=c("1", "1", "-1"),
Day2=c("1", "1", "-1"),
Day3=c("1", "1", "-1"),
Day4=c("1", "1", "1"),
Day5=c("1", "1", "1")),
class= "data.frame", row.names=c(NA, -3L))
And third, all with 2 or more consecutive numbers prior to what would be Day 6:
df <- structure(list(Variable =c("C", "C", "C"),
Day1=c("1", "1", "-"),
Day2=c("1", "1", "1"),
Day3=c("1", "1", "1"),
Day4=c("1", "1", "-1"),
Day5=c("1", "1", "-1")),
class= "data.frame", row.names=c(NA, -3L))
Here's one solution:
library(dplyr)
# Define a helper-function to count -1's on selected days
sum_days <- function(.df, n) {
vars <- paste0("Day", n)
rowSums(.df[vars] == -1)
}
# Now mark which rows go where:
grouped_df <- df %>%
mutate(
DF1 = sum_days(., 1:2) >= 2,
DF2 = sum_days(., 3) == 1,
DF3 = sum_days(., 1:5) >= 2
) %>%
group_by(Variable) %>%
mutate_at(vars(starts_with("DF")), any) %>%
ungroup()
Now you have two options - you can either leave the data frame as-is, and use the DF1, DF2 and DF3 filters to mark which rows you're using for subsequent analyses, or you can explicitly create your new data frames:
df1 <- grouped_df[grouped_df $DF1,]
df2 <- grouped_df[grouped_df $DF2,]
df3 <- grouped_df[grouped_df $DF3,]
If you aren't familiar with the pipe operator (%>%), it's a way to linearize the code. All it does it call the next function with the result of the previous function as the first argument.
I am looking for a way to search for a specific form of interaction between the levels of the factors constituting the rows of a dataframe.
I have a dataframe, such as this one, in which each column is an individual, and each row an observation:
A B C D E G H I
1 NA "1" "1" "1" "1" NA "1" "1"
2 "2" "1" "2" "1" "1" NA "1" "1"
3 "1" "2" "2" "1" "1" "1" "1" "2"
4 "1" "2" "2" "2" "3" "3" "4" "2"
5 "1" "1" "2" "2" "1" "2" "1" "2"
What I want to detect is the existence (or not) of combination of factor levels, such as for an x:x' and x:y' exists also a combination y:x' and y:y'. For instance here, such a combination exists for rows 2 and 3, wich I can see by using interaction or : :
> df <- structure(c(NA, "2", "1", "1", "1", "1", "1", "2", "2", "1",
"1", "2", "2", "2", "2", "1", "1", "1", "2", "2", "1", "1", "1",
"3", "1", NA, NA, "1", "3", "2", "1", "1", "1", "4", "1", "1",
"1", "2", "2", "2"), .Dim = c(5L, 8L), .Dimnames = list(c("1",
"2", "3", "4", "5"), c("A", "B", "C", "D", "E", "G", "H", "I")))
> interaction(df["2",],df["3",])
[1] 2.1 1.2 2.2 1.1 1.1 <NA> 1.1 1.2
Levels: 1.1 2.1 1.2 2.2
as well as :
> as.factor(df["2",]):as.factor(df["3",])
[1] 2:1 1:2 2:2 1:1 1:1 <NA> 1:1 1:2
Levels: 1:1 1:2 2:1 2:2
But, now, I would like the detection to be done automatically, so that I could put the labels of all the pairs of rows in the dataframe in which such a configuration (x:y, x:y', x':y, x':y') is detected into an edgelist for the network I want to draw afterwards (here, for instance, I would like to add a row "2","3" to the edgelist).
I have found an elaborate way to do that using Perl and regular expressions, but I wondered if there existed a way to do that in R, without using Regexp.
Edit [04/05/2013]
To avoid being unclear, here are more details about the configuration I'm looking for:
let {x,y,...} be observations of the first row
let {x',y',...} be observations of the second row
for interactions ({x,x'} and {x,y'}) does it exists interactions ({y,x'} and {y,y'})
So, to take a few examples, interactions such as:
1:1, 1:2, 2:1, 2:2 (rows 2 and 3)
or
1:1, **2:1**, **2:2**, **3:1**, **3:2**, 4:1 (rows 4 and 5)
would match, but not
1:1,1:2,1:3,1:4, 2:2 (rows 3 and 4)
or
1:1,1:2 (rows 1 and 2)
for instance.
What I have for now is a code that does what I want to do (imitated from a previous Perl script), in a tremendous amount of time (even if I add a while loop to avoid unnecessary comparisons), and using multiple loops and regexp. I was hoping for a less needlessly complicated way of doing this comparison. Here is how I do now:
df <- structure(c(NA, "2", "1", "1", "1", "1", "1", "2", "2", "1",
"1", "2", "2", "2", "2", "1", "1", "1", "2", "2", "1", "1", "1",
"3", "1", NA, NA, "1", "3", "2", "1", "1", "1", "4", "1", "1",
"1", "2", "2", "2"), .Dim = c(5L, 8L), .Dimnames = list(c("1",
"2", "3", "4", "5"), c("A", "B", "C", "D", "E", "G", "H", "I")))
"myfunction" = function(x){
TableVariantes = as.matrix(x) ;
#Creating the edgelist for the network
edgelist = c(character(0),character(0));
TotalVL = nrow(TableVariantes);
for(i in 1:(TotalVL-1)){
VLA = i;
if(!(i+1) > TotalVL){
for(j in (i+1):TotalVL){
VLB = j ;
problematic.configuration = FALSE;
#False until proven otherwise
interactions = interaction(as.factor(TableVariantes[VLA,]):as.factor(TableVariantes[VLB,]),drop=TRUE);
if(nlevels(as.factor(interactions)) > 3){
#More than three configurations, let's go
#Testing every level of the first variant location
for(k in levels(as.factor(TableVariantes[VLA,]))){
# We create the regexp we will need afterwards. Impossible to use variables inside a regex in R.
searchforK = paste(k,":(.+)",sep="")
if (length(grep(searchforK,levels(interactions), ignore.case = TRUE, perl = TRUE)) > 1){
#More than one configuration for this level of the first row
#capturing corresponding observations of the second row
second.numbers = regexec(searchforK,levels(interactions), ignore.case = TRUE)
second.numbers = do.call(rbind,lapply(regmatches(levels(interactions),second.numbers),`[`))
#Interactions with first number other than the one we are testing
invert.matches = grep(searchforK,levels(interactions), ignore.case = TRUE, perl = TRUE, value=TRUE, invert=TRUE)
#listing these alternative first numbers
alternative.first.numbers = regexec("(.+?):.+",levels(as.factor(invert.matches)), ignore.case = TRUE)
alternative.first.numbers = do.call(rbind,lapply(regmatches(levels(as.factor(invert.matches)),alternative.first.numbers),`[`))
#testing each alternative first number
for(l in levels(as.factor(alternative.first.numbers[,2]))){
#variable problems to count the problematic configurations
problems = 0 ;
#with each alternative second number
for(m in levels(as.factor(second.numbers[,2]))){
searchforproblem = paste(l,":",m,sep="");
if(length(grep(searchforproblem,invert.matches,ignore.case = TRUE, perl = TRUE)) > 0){
#if it matches
problems = problems + 1;
}
if(problems > 1){
#If two possibilities at least
problematic.configuration = TRUE;
}
}
}
}
}
}
if(problematic.configuration == TRUE){
edgelist = rbind(edgelist,c(rownames(TableVariantes)[VLA],rownames(TableVariantes)[VLB]));
#adding a new edge to the network of conflicts !
}
}
}
}
return(edgelist);
}
You can use the dput() function to provide example data with your question.
df <- structure(list(A = c("1", "2", "2", "1", "1", "1", NA, "2", "1",
"2"), B = c(NA, "2", "2", "2", "2", "1", "2", "2", "1", NA),
C = c("1", "2", "1", "1", NA, "1", NA, "2", "2", NA), D = c(NA,
NA, "2", "1", NA, "1", NA, "1", "1", NA), E = c(NA, NA, NA,
"2", "1", NA, "1", "2", NA, "1"), H = c(NA, NA, "1", "2",
NA, "1", "2", "2", NA, "1"), I = c(NA, NA, NA, NA, NA, NA,
"1", "1", NA, "2"), J = c("2", "1", "2", "1", "1", "2", NA,
"2", NA, "2"), K = c("1", "1", NA, "1", "2", "1", NA, "1",
"1", "1"), O = c("2", "2", "1", "2", "1", "1", NA, "2", "1",
NA)), .Names = c("A", "B", "C", "D", "E", "H", "I", "J",
"K", "O"), row.names = c(NA, -10L), class = "data.frame")
I assume that you are interested in discovering what pairs of observations (rows) have four unique interaction levels among the individuals (columns). Here is one way to work that out using for loops.
# convert your data frame to a matrix
m <- as.matrix(df)
# create another matrix to store the results
N <- dim(m)[1]
levelsmat <- matrix(NA, nrow=(N*N - N)/2, ncol=3,
dimnames=list(NULL, c("i", "j", "nlevels")))
# go through all possible pairs of observations
# and record the number of unique interactions
count <- 0
for(i in 1:(N-1)) {
for(j in (i+1):N) {
count <- count + 1
int <- interaction(m[i, ], m[j, ], drop=TRUE)
levelsmat[count, ] <- c(i, j, length(levels(int)))
}}
# paired observations that had 4 unique interactions
levelsmat[levelsmat[, "nlevels"]==4, ]