Keep rows before a date using ids

Keep rows before a date using ids - r

Having two dataframes with dates like this:
df1 <- data.frame(id = c(1,1,1,1,2,2), date=c("2019/12/11 20:30:12", "2019/12/12 09:20:12", "2019/12/12 11:30:40", "2019/12/13 20:12:34", "2019/12/11 12:20:12", "2019/12/11 19:20:12"), values = c(23,4,1,3,4,2))
df2 <- data.frame(id = c(1,2), date = c("2019/12/12 09:20:12", "2019/12/11 19:20:12"))
How is it possible to use the values of dates of the second dataframe to keep rows before this date into the first dataframe?
Example of expected output:
data.frame(id = c(1,1,2,2), date=c("2019/12/11 20:30:12", "2019/12/12 09:20:12", "2019/12/11 12:20:12, "2019/12/11 19:20:12"), values = c(23,4,4,2))

We can do a left_join and then filter
library(dplyr)
library(lubridate)
left_join(df1, df2, by = 'id') %>%
filter(ymd_hms(date.x) <= ymd_hms(date.y)) %>%
select(id, date = date.x, values)
#id date values
#1 1 2019/12/11 20:30:12 23
#2 1 2019/12/12 09:20:12 4
#3 2 2019/12/11 12:20:12 4
#4 2 2019/12/11 19:20:12 2

Related

How can I search in a data frame if all possible combinations exist in another data frame in R using dplyr?

I have two data frames.
The first one that contains all the possible combinations with their corresponding values and looks like this:
first
second
val
A
B
10
A
C
20
A
D
30
B
C
40
B
D
50
C
D
60
H
I
70
The second one that comes from the production line has two columns the date column that has grouped all the variables corresponding to their date and are concatenated:
date
var
2022-01-01
A
2022-02-01
B,C,F,E,G,H,I
I want to find all the combinations in the second data frame and to see if they match with any combinations in the first data frame. If a variable stands alone in the second data frame as A in 2022-01-01 to give me the 0 and otherwise the value of the combination.
Ideally I want the resulting data frame to look like this:
date
comb
val
2022-01-01
A
0
2022-02-01
B,C
40
2022-02-01
H,I
70
How can I do this in R using dplyr?
library(tidyverse)
first = c("A","A","A","B","B","C","H")
second = c("B","C","D","C","D","D","I")
val = c(10,20,30,40,50,60,70)
df1 = tibble(first,second,val);df1
date = c(as.Date("2022-01-01"),as.Date("2022-02-01"))
var = c("A","B,C,F,E,G,H,I")
df2 = tibble(date,var);df2

Using tidyverse:
library(tidyverse)
first = c("A","A","A","B","B","C","H")
second = c("B","C","D","C","D","D","I")
val = c(10,20,30,40,50,60,70)
df1 = tibble(first,second,val);df1
date = c(as.Date("2022-01-01"),as.Date("2022-02-01"))
var = c("A","B,C,F,E,G,H,I")
df2 = tibble(date,var);df2
df2_tidy <- df2 %>%
mutate(first = str_split(var, ","),
second = first) %>%
unnest(first) %>%
unnest(second) %>%
select(-var)
singles <- df2 %>%
filter(!str_detect(var, ",")) %>%
mutate(val = 0) %>%
select(date, comb = var, val)
combs <- df1 %>%
inner_join(df2_tidy, by = c("first", "second")) %>%
mutate(comb = paste(first, second, sep = ",")) %>%
select(date, comb, val)
bind_rows(singles, combs)

Replace values in dataframe based on other dataframe with column name and value

Let's say I have a dataframe of scores
library(dplyr)
id <- c(1 , 2)
name <- c('John', 'Ninaa')
score1 <- c(8, 6)
score2 <- c(NA, 7)
df <- data.frame(id, name, score1, score2)
Some mistakes have been made so I want to correct them. My corrections are in a different dataframe.
id <- c(2,1)
column <- c('name', 'score2')
new_value <- c('Nina', 9)
corrections <- data.frame(id, column, new_value)
I want to search the dataframe for the correct id and column and change the value.
I have tried something with match but I don't know how mutate the correct column.
df %>% mutate(corrections$column = replace(corrections$column, match(corrections$id, id), corrections$new_value))

We could join by 'id', then mutate across the columns specified in the column and replace the elements based on the matching the corresponding column name (cur_column()) with the column
library(dplyr)
df %>%
left_join(corrections) %>%
mutate(across(all_of(column), ~ replace(.x, match(cur_column(),
column), new_value[match(cur_column(), column)]))) %>%
select(names(df))
-output
id name score1 score2
1 1 John 8 9
2 2 Nina 6 7

It's an implementation of a feasible idea with dplyr::rows_update, though it involves functions of multiple packages. In practice I prefer a moderately parsimonious approach.
library(tidyverse)
corrections %>%
group_by(id) %>%
group_map(
~ pivot_wider(.x, names_from = column, values_from = new_value) %>% type_convert,
.keep = TRUE) %>%
reduce(rows_update, by = 'id', .init = df)
# id name score1 score2
# 1 1 John 8 9
# 2 2 Nina 6 7

Filter rows using column values as condition for another dataset

This is pretty tricky. Let's say I have, for example, a first dataset df:
sample id name
1 ID200,ID300,ID299 first
2 ID2,ID123 second
3 ID90 third
And a second dataset df_1:
ids condition
ID200 y
ID300 n
ID299 n
ID2 y
ID123 y
ID90 n
I have to filter, from the first dataset, all the rows in which all ID values satisfy a condition on the second table, like y.
So the filtering in this example should give:
sample id name
2 ID2,ID123 second
I was thinking to use something like:
new_df = df %>%
filter(grepl('ID', id), df_1$condition == 'y')
But obviously I need something different, can you give me some clues?
Edit: As I said in the comment, what happens if I have df's id column populated with other text, like this?
sample id name
1 ID = ID200,ID300,ID299,abcd first
2 ID = ID2,ID123, dfg second
3 ID = ID90, text third

Perhaps a bit inelegant, but this would give you the final condition status of each sample.
library(tidyverse)
df <- tibble(sample = c(1, 2, 3),
id = c("ID200,ID300,ID299", "ID2,ID123", "ID90"),
name = c("first", "second", "third"))
df_1 <- tibble(ids = c("ID200", "ID300", "ID299", "ID2", "ID123", "ID90"),
condition = c("y", "n", "n", "y", "y", "n"))
df2 <- df %>%
mutate(ids = str_split(id, ",")) %>%
unnest() %>%
inner_join(df_1, by = "ids") %>%
group_by(sample) %>%
summarise(condition = min(condition))
You could then join that to the to the original data frame for filtering.
filtered <- inner_join(df, df2, by = "sample") %>%
filter(condition == "y")

I'd start by tidying df so as id contains one observation per row:
library(tidyr)
library(dplyr)
df %>%
separate_rows(id)
sample id name
1 1 ID200 first
2 1 ID300 first
3 1 ID299 first
4 2 ID2 second
5 2 ID123 second
6 3 ID90 third
The same operation, followed by a join with df_1:
df %>%
separate_rows(id) %>%
left_join(df_1, by = c("id" = "ids"))
sample id name condition
1 1 ID200 first y
2 1 ID300 first n
3 1 ID299 first n
4 2 ID2 second y
5 2 ID123 second y
6 3 ID90 third n
Now you can group on sample and filter for cases where the only condition is "y":
new_df <- df %>%
separate_rows(id) %>%
left_join(df_1, by = c("id" = "ids")) %>%
group_by(sample) %>%
filter(condition == "y",
n_distinct(condition) == 1) %>%
ungroup()
Result:
sample id name condition
<int> <chr> <chr> <chr>
1 2 ID2 second y
2 2 ID123 second y
If you really want to transform back to the original format with comma-separated ids in a column:
library(purrr)
new_df %>%
nest(id) %>%
mutate(newid = map_chr(data, ~paste(.$id, collapse = ","))) %>%
select(sample, id = newid, name)
sample id name
<int> <chr> <chr>
1 2 ID2,ID123 second

Computing average over different columns/rows in a list of data.frames

I've a list of 140 elements of type data.frame ('my.list'). I would like to compute 350 averages of certain values ranges in a certain column for a certain set of rows in a certain data.frame (this is a bit cryptic); so, 350 different averages like:
Of data.frame #1, the average of column 'Measure1', row 1:5;
Of data.frame #2, the average of column 'Measure3', row 1:4, etc. etc.
I have another data.frame ('my.dfAverage') which indicates for which data.frame, column and rows it needs the average. I want to write the 350 different averages and standard deviations to this data.frame (so with the columns: 'average_id', 'dataframe_number', 'column_name', 'row_numbers', 'average' and 'st_dev'). Some value ranges have NA's, these values can be dropped for computing the average.
What is the best way to automatically compute the 350 averages and standard deviations from the list of data.frames based on the info in this data.frame? I thought of creating a for-loop (or maybe the lapply function?), but I'm quite new to these functions, so I'm not sure what the way to go is here.
Small reproducible example of my list of data.frames:
my.df1 <- data.frame(ID = c(1:5),
Measure1 = c(2247,2247,1970,1964,1971),
Measure2 = c(2247,2247,NA,1964,1971))
my.df2 <- data.frame(ID = c(1:4),
Measure3 = c(2247,NA,1970,1964),
Measure5 = c(2247,2247,NA,1964))
my.df3 <- data.frame(ID = c(1:4),
Measure6 = c(2247,600,1970,1964),
Measure8 = c(2247,2247,NA,1964))
my.list <- list(list1 = my.df1, list2 = my.df2, list3 = my.df3)
Desired output table for the averages and standard deviation:
my.dfAverage <- data.frame(average_id = c(1:3),
dataframe_number = c(1,2,3),
column_name = c('Measure1','Measure3','Measure6'),
row_numbers = c('1:3','1:4','1:2'),
average = (NA),
st_dev = (NA))

This is a different approach than the one given above: I will use only base r functions: Point to note, ensure the data has stringsAsFactors=FALSE
write a function but ensure you index mylist correctly. then compute the function on this i e f(...,na.rm=T). to write a function using apply:
fun1=function(f){with(my.dfAverage,
mapply(function(x,y,z)
f(x[eval(parse(text=y)),z],na.rm=T),my.list,row_numbers,column_name))}
transform(my.dfAverage,average=fun1(mean),st_dev=fun1(sd))
average_id dataframe_number column_name row_numbers average st_dev
1 1 1 Measure1 1:3 2154.667 159.9260
2 2 2 Measure3 1:4 2060.333 161.6859
3 3 3 Measure6 1:2 1423.500 1164.6049
Data Used:
my.dfAverage <- data.frame(average_id = c(1:3),
dataframe_number = c(1,2,3),
column_name = c('Measure1','Measure3','Measure6'),
row_numbers = c('1:3','1:4','1:2'),
average = (NA),
st_dev = (NA),stringsAsFactors = F)

A solution using tidyverse.
First, expand the my.dfAverage based on row_numbers.
library(tidyverse)
my.dfAverage2 <- my.dfAverage %>%
separate(row_numbers, into = c("start", "end")) %>%
mutate(row_numbers = map2(start, end, `:`)) %>%
unnest() %>%
select(-start, -end) %>%
mutate(row_numbers = as.integer(row_numbers),
dataframe_number = as.integer(dataframe_number))
Second, transform all data frames in my.list and combine them to a single data frame.
my.list.df <- my.list %>%
setNames(1:length(.)) %>%
map_dfr(function(x){
x2 <- x %>%
gather(column_name, value, -ID)
return(x2)
},.id = "dataframe_number") %>%
mutate(ID = as.integer(ID), dataframe_number = as.integer(dataframe_number)) %>%
rename(row_numbers = ID)
Third, merge my.dfAverage2 and my.list.df and calculate the mean and standard deviation. my.dfAverage3 is the final output.
my.dfAverage3 <- my.dfAverage2 %>%
left_join(my.list.df, by = c("dataframe_number", "column_name", "row_numbers")) %>%
group_by(average_id, dataframe_number, column_name) %>%
summarise(row_numbers = paste(min(row_numbers), max(row_numbers), sep = ":"),
average = mean(value, na.rm = TRUE),
st_dev = sd(value, na.rm = TRUE)) %>%
ungroup()
my.dfAverage3
# A tibble: 3 x 6
# average_id dataframe_number column_name row_numbers average st_dev
# <int> <int> <chr> <chr> <dbl> <dbl>
# 1 1 1 Measure1 1:3 2155 160
# 2 2 2 Measure3 1:4 2060 162
# 3 3 3 Measure6 1:2 1424 1165
DATA
my.list is the same as OP's my.list.
my.dfAverage <- data.frame(average_id = c(1:3),
dataframe_number = c(1,2,3),
column_name = c('Measure1','Measure3','Measure6'),
row_numbers = c('1:3','1:4','1:2'))

Remove row below conditionally in dataframe and add values together in R

I have a large dataset with 3 columns: Name, Country, and Sales.
I'd like to sum the Sales column by Names that are both identical and occur consecutively. Then I'd like to remove all rows but the first occurrence of a series, replacing the value of Sales with the series sum.
For example:
Name,Country,Sales
A,V,100
A,W,100
B,X,100
B,Y,100
A,Z,100
Would be reduced to:
Name,Country,Sales
A,V,200
B,X,200
A,Z,100
Anyone got any idea how to do this?

Your data
df <- structure(list(Name = c("A", "A", "B"), Country = c("X", "Y",
"Z"), Sales = c(100L, 100L, 100L)), .Names = c("Name", "Country",
"Sales"), row.names = c(NA, -3L), class = c("data.table", "data.frame"
))
dplyr solution
library(dplyr)
library(data.table)
ans <- df %>%
group_by(rleid(Name)) %>%
summarise(Name = unique(Name), Sales=sum(Sales)) %>%
select(-1)
Output
Name Sales
<chr> <int>
1 A 200
2 B 100
Alternative example
newdf <- rbind(df, data.frame(Name=c("A","A","B","B"),
Country=c("A","B","C","D"),
Sales=c(100,100,100,100)))
ans <- newdf %>%
group_by(rleid(Name)) %>%
summarise(Name = unique(Name), Sales=sum(Sales)) %>%
select(-1)
Output
Name Sales
<fctr> <dbl>
1 A 200
2 B 100
3 A 200
4 B 200

Here's another solution using sqldf:
library(data.table)
df <- fread("Name,Country,Sales
A,V,100
A,W,100
B,X,100
B,Y,100
A,Z,100")
df$rle = rleid(df$Name)
library(sqldf)
sqldf("select min(rowid) as row_names,
Name,
Country,
sum(Sales) as Sales
from df group by rle", row.names = TRUE)
# Name Country Sales
# 1 A V 200
# 3 B X 200
# 5 A Z 100
row.names = TRUE searches for a column named row_names and treats it as row names, so min(rowid) will not show up as a new column if I set it as row_names.

Try this:
require(dplyr)
df %>%
group_by(Series=rleid(Name)) %>%
mutate(Sales = sum(Sales)) %>%
filter(1:n() == 1)
Output:
Name Country Sales Series
1 A V 200 1
2 B X 200 2
3 A Z 100 3
Sample data:
require(data.table)
df <- fread("Name,Country,Sales
A,V,100
A,W,100
B,X,100
B,Y,100
A,Z,100")