Retrieve values of the data.frame by matching ID and column name - r

I have a dataframe named df1 which has four columns (i.e. id, s, date and value). The value column is empty and I want to fill it using a second dataframe named df2. df2 has an id column and many other columns, each named after the date it belongs to. All I need is to fill df1$value with the corresponding values from df2, where both the date and the id match.
Example data:
set.seed(123)
#df1
df1 <- data.frame(id = 1:100,
s = runif(100,100,1000),
date = sample(seq(as.Date('1999/01/01'), as.Date('2001/01/01'), by="day"), 100),
value = NA)
#df2
df2 <- data.frame(matrix(runif(80000,1,100), ncol=800, nrow=100))[-1]
names(df2) <- seq(as.Date("1999-01-01"),as.Date("2002-12-31"),1)[c(1:799)]
df2 <- cbind(id = 1:100, df2)

One way is to convert df2 into long format using gather and then do left_join
library(dplyr)
library(tidyr)
# Reshape df2 to long format (one row per id/date pair) and join it to df1.
# pivot_longer() is the current tidyr replacement for the superseded gather().
# The former column names arrive as character, so they are parsed to Date
# to match the type of df1$date before joining on both keys.
df1 %>%
  left_join(
    df2 %>%
      pivot_longer(-id, names_to = "date", values_to = "value") %>%
      mutate(date = as.Date(date)),
    by = c("id", "date")
  )
# id s date value
#1 1 359 2000-03-15 48.32
#2 2 809 1999-09-01 62.16
#3 3 468 1999-12-23 16.41
#4 4 895 2000-11-26 32.70
#5 5 946 1999-12-18 5.84
#6 6 141 2000-10-09 74.65
#7 7 575 2000-10-25 9.22
#8 8 903 2000-03-17 6.46
#9 9 596 1999-10-25 73.48
#10 10 511 1999-04-17 62.43
#...
data
set.seed(123)
df1 <- data.frame(id = 1:100,
s = runif(100,100,1000),
date = sample(seq(as.Date('1999/01/01'), as.Date('2001/01/01'), by="day"), 100))
df2 <- data.frame(matrix(runif(80000,1,100), ncol=800, nrow=100))[-1]
names(df2) <- seq(as.Date("1999-01-01"),as.Date("2002-12-31"),1)[c(1:799)]
df2 <- cbind(id = 1:100, df2)

You can also use melt and then left join using both the keys:
library(dplyr)
library(reshape2)
set.seed(123)
#df1
df1 <- data.frame(id = 1:100,
s = runif(100,100,1000),
date = sample(seq(as.Date('1999/01/01'), as.Date('2001/01/01'), by="day"), 100),
value = NA)
#df2
df2 <- data.frame(matrix(runif(80000,1,100), ncol=800, nrow=100))[-1]
names(df2) <- seq(as.Date("1999-01-01"),as.Date("2002-12-31"),1)[c(1:799)]
df2 <- cbind(id = 1:100, df2)
df2<-melt(df2, id.vars = "id", value.name = "Value", variable.name = "date")
df2$date<-as.Date(df2$date, format = "%Y-%m-%d")
df1<-left_join(df1, df2, by = c("id", "date"))
head(df1)
id s date value Value
1 1 358.8198 2000-03-15 NA 48.31799
2 2 809.4746 1999-09-01 NA 62.15760
3 3 468.0792 1999-12-23 NA 16.41291
4 4 894.7157 2000-11-26 NA 32.70024
5 5 946.4206 1999-12-18 NA 5.83607
6 6 141.0008 2000-10-09 NA 74.64832

We can use an efficient data.table join, which should be fast for big datasets.
library(data.table)
# Long format of df2: one row per (id, date) cell, with the former column
# names (held in `variable`) parsed into dates.
df2_long <- melt(setDT(df2), id.vars = "id")[
  , date := as.IDate(variable, format = "%Y-%m-%d")
]
# Look up every row of df1 in the long table. Writing the join as
# df2_long[df1] returns exactly one row per row of df1; df1[df2_long]
# would instead return one row per cell of df2.
df2_long[setDT(df1), on = .(id, date)]

Related

How to do SUMIFS in R

Data:
set.seed(42)
df1 = data.frame(
Date = seq.Date(as.Date("2018-01-01"),as.Date("2018-01-30"),1),
value = sample(1:30),
Y = sample(c("yes", "no"), 30, replace = TRUE)
)
df2 = data.frame(
Date = seq.Date(as.Date("2018-01-01"),as.Date("2018-01-30"),7)
)
I want, for each date in df2$Date, to calculate the sum of df1$value over the rows whose df1$Date falls between df2$Date and df2$Date + 6 (inclusive).
In short, I need to calculate weekly sums.
Using data.table, create a range start/end, then merge on overlap, then get sum over group:
library(data.table)
# foverlaps() joins on intervals, so both tables get a start and an end column.
# Each row of df1 is a single day, i.e. the zero-length interval [Date, Date].
df1$start <- df1$Date
df1$end <- df1$Date
# Each row of df2 is a 7-day window [Date, Date + 6].
df2$start <- df2$Date
df2$end <- df2$Date + 6
# Convert both to data.table and key them on the interval columns.
setDT(df1, key = c("start", "end"))
setDT(df2, key = c("start", "end"))
# Overlap join, then sum `value` per window; `Date` in the result is the
# window start taken from df2 (see the printed output below).
foverlaps(df1, df2)[, list(mySum = sum(value)), by = Date ]
# Date mySum
# 1: 2018-01-01 138
# 2: 2018-01-08 96
# 3: 2018-01-15 83
# 4: 2018-01-22 109
# 5: 2018-01-29 39
Check out the lubridate and dplyr libraries; those two are quite common.
library(lubridate)
library(dplyr)
# Label every date with the last day (Sunday) of its Monday-based week.
# 2018-01-01 is a Monday, so each group covers exactly one of the requested
# windows df2$Date .. df2$Date + 6.
# week_start = 1 is required: the default is Sunday-based weeks, which would
# produce Sun-Sat groups that do not line up with those windows.
# change_on_boundary = TRUE makes a Monday round up to the next Monday, so
# subtracting one day always lands on the Sunday of the same week.
df1$last_week_day <- ceiling_date(
  df1$Date, "week",
  week_start = 1, change_on_boundary = TRUE
) - 1
df1 %>%
  group_by(last_week_day) %>%
  summarize(week_value = sum(value))
We can use fuzzyjoin
library(dplyr)
library(fuzzyjoin)
df2$EndDate <- df2$Date+6
fuzzy_left_join(
df1, df2,
by = c(
"Date" = "Date",
"Date" = "EndDate"
), match_fun = list(`>=`, `<=`)) %>%
group_by(Date.y) %>% summarise(Sum=sum(value))
# A tibble: 5 x 2
Date.y Sum
<date> <int>
1 2018-01-01 138
2 2018-01-08 96
3 2018-01-15 83
4 2018-01-22 109
5 2018-01-29 39

Rename Columns of dataframe based on names of list in R

I have multiple dataframes saved in a list object. They share the same two column names. I'd like to rename the second column to the name of the dataframe.
Example Data:
df1 <- data.frame(A = 1:10, B= 11:20)
df2 <- data.frame(A = 21:30, B = 31:40)
df3 <- data.frame(A = 31:40, B= 41:50)
df4 <- data.frame(A = 51:80, B = 61:70)
listDF <- list(df1, df2,df3, df4)
I'm trying to use lapply to rename the second column to match the name of the dataframe.
# trying to rename second column after the element of the list they're located in
listDF_2 <- lapply(names(listDF), function(x) setNames(listDF[[x]], x) )
You may like to use dplyr::bind_rows in this case. It simplifies using name of the data.frames as a new column in combined data frame.
# Create list as.
listDF <- list(df1 = df1, df2 = df2,df3 = df3, df4 = df4)
library(dplyr)
# Now combine all data frames. The name of data frame will be in 'DF_Name' column
bind_rows(listDF, .id = "DF_Name")
# DF_Name A B
# 1 df1 1 11
# 2 df1 2 12
# 3 df1 3 13
# 4 df1 4 14
# 5 df1 5 15
# 6 df1 6 16
# 7 df1 7 17
# 8 df1 8 18
# 9 df1 9 19
# 10 df1 10 20
# 11 df2 21 31
# 12 df2 22 32
# 13 df2 23 33
#.................
#.................
# 58 df4 78 68
# 59 df4 79 69
# 60 df4 80 70
Note: As @Moody_Mudskipper has pointed out, one can simply use
listDF <- lst(df1, df2, df3, df4)
and then use dplyr::bind_rows.
To keep track of names, you can use:
listDF <- list(df1 = df1, df2 = df2, df3 = df3, df4 = df4)
Then you can use a for loop:
for (i in names(listDF)){
colnames(listDF[[i]]) <- c("A", i)
}
Or if you need to use lapply, you may use this:
# Build a renamed copy of every data frame in listDF: the first column is
# called "A" and the second takes the name of the list element. The result
# keeps the element names of listDF.
newDF <- setNames(
  lapply(names(listDF), function(nm) {
    renamed <- listDF[[nm]]
    colnames(renamed) <- c("A", nm)
    renamed
  }),
  names(listDF)
)

Joining dataframes in R (1:n relation)

I have a data.frame df1
df1 <- data.frame(id=1:10)
and I have a second data.frame df2
df2 <- data.frame(id=1:100, key=sample(1:10,100,replace=T), var1=sample(c(TRUE, FALSE),100, replace=T), var2=sample(c("X", "Y"),100, replace=T))
Variable df2$key is a secondary key and points to the variable df1$id.
Now for each entry in df1 I would like to check how many entries there are in df2, given a certain condition.
An example:
If df1$id==5 I would like to create a variable df1$count that counts the number of entries in data.frame df2 where df2$key==5 and df2$var1==TRUE.
Thank you for your help!
Here's how you could do it in base R:
merge(df1, aggregate(var1 ~ key, df2, FUN = sum),
by.x = "id", by.y = "key", all.x = TRUE)
id var1
1 1 3
2 2 1
3 3 4
4 4 6
5 5 9
6 6 4
7 7 5
8 8 7
9 9 4
10 10 3
or using dplyr:
library(dplyr)
df2 %>%
filter(var1) %>%
count(key) %>%
right_join(df1, by = c("key" = "id"))
In both cases we do the counting first and then merge the result to df1.

Replace a part of dataframe with new data

I have data1 and data2, and I need data3, that replaces certain regions of data1 with data2.
I use this method to update the data, but actually several columns need to be updated and it would be tedious.
Do you know a more simple way?
library(tidyverse)
library(lubridate)
data1 <- tibble(date=date("2017-11-1") + c(1:10),
a=sample(100,10),b=sample(100,10))
data2 <- tibble(date=date("2017-11-1") + c(1:8),
a=sample(100,8))
data_bind <- left_join(data1, data2, by=("date"))
data_bind$a.x[!is.na(data_bind$a.y)] <- data_bind$a.y[!is.na(data_bind$a.y)]
data_bind %>% select(-a.y) %>% dplyr::rename(a=a.x)
In my opinion, the data.table-package is better suited for such a task. Using:
# create a vector with the column names from 'data2' that are not used to
# join by ([-1] drops the first column, 'date', which is the join key)
nms <- names(data2)[-1]
# load the 'data.table'-package
library(data.table)
# convert the dataframes to data.table's
setDT(data1)
setDT(data2)
# join and update the columns in 'data1' with the matching values from
# 'data2'; the 'i.' prefix refers to the columns of 'data2' (the table
# inside the brackets), and the trailing [] prints the updated 'data1'
data1[data2, on = 'date', (nms) := mget(paste0('i.',nms))][]
gives:
date a b
1: 2017-11-02 21 11
2: 2017-11-03 22 12
3: 2017-11-04 23 13
4: 2017-11-05 24 14
5: 2017-11-06 25 15
6: 2017-11-07 26 16
7: 2017-11-08 27 17
8: 2017-11-09 28 18
9: 2017-11-10 9 19
10: 2017-11-11 10 20
What this does:
With setDT(data1) you convert the dataframes/tibbles to a data.table.
With data1[data2, on = 'date'] you can do a join the data.table-way.
By adding (nms) := mget(paste0('i.',nms)) to the join, you tell data.table to update the columns in data1 with the columns that are also present in data2 only where the dates match.
As an alternative approach you could also reshape both datasets into long format and then do the join:
library(data.table)
melt(data1, id = 'date')[melt(data2, id = 'date')
, on = .(date, variable)
, value := i.value
][, dcast(.SD, date ~ variable)]
A translation of this approach to the tidyverse:
library(dplyr)
library(tidyr)
gather(data1, key, value, -1) %>%
left_join(., gather(data2, key, value, -1), by = c('date','key')) %>%
mutate(value.x = ifelse(!is.na(value.y), value.y, value.x)) %>%
select(date, key, value = value.x) %>%
spread(key, value)
Both will give you the same output.
Used data:
data1 <- data.frame(date = as.Date("2017-11-1") + c(1:10), a = 1:10, b = 11:20)
data2 <- data.frame(date = as.Date("2017-11-1") + c(1:8), a = 21:28)

How to merge columns with different names using "merge, by=Column.name" function?

# Data1
SampleID <- c("A-01","B-01","C-01")
Value <- c(1,2,3)
data1 <- data.frame(SampleID, Value)
# Data2
SampleID <- c("A","B","C")
Value1 <- c(3,4,5)
data2 <- data.frame(SampleID,Value1)
# Output : What I want is the following using:
merge(data1, data2, by=c("SampleID"), all = TRUE)
SampleID Value Value1
A-01 1 3
B-01 2 4
C-01 3 5
You can first split SampleID from data1 and then concatenate it.
# Split data1's SampleID ("A-01") into its prefix ("A") and suffix ("01"),
# merge on the prefix, then rebuild the full ID for the output.
SampleID <- c("A-01", "B-01", "C-01")
# Split at the first hyphen instead of fixed character positions, so
# prefixes longer than one character (e.g. "AB-01") also work.
Sample <- sub("-.*$", "", SampleID)
Num <- sub("^[^-]*-", "", SampleID)
Value <- c(1, 2, 3)
data1 <- data.frame(Sample, Num, Value)
SampleID <- c("A", "B", "C")
Value1 <- c(3, 4, 5)
data2 <- data.frame(SampleID, Value1)
merged <- merge(data1, data2, by.x = "Sample", by.y = "SampleID", all = TRUE)
merged$SampleID <- paste(merged$Sample, merged$Num, sep = "-")
# Select the output columns by name rather than by position.
merged <- merged[, c("SampleID", "Value", "Value1")]
SampleID Value Value1
1 A-01 1 3
2 B-01 2 4
3 C-01 3 5
I believe the following does what you need.
data1$NewID <- gsub("[^[:alpha:]]", "", data1$SampleID)
result <- merge(data1, data2, by.x = "NewID", by.y = "SampleID", all = TRUE)
result <- result[-1]
result
# SampleID Value Value1
#1 A-01 1 3
#2 B-01 2 4
#3 C-01 3 5
You can then remove the extra column from data1 with
data1 <- data1[-3]
You can do it using sqldf library:
library(sqldf)
# Match each data1 row to the data2 row whose SampleID is its prefix,
# e.g. 'A-01' LIKE 'A-%'. With sqldf's default SQLite backend, strings are
# concatenated with ||, not +.
sqldf("SELECT data1.SampleID, data1.Value, data2.Value1
       FROM data1
       JOIN data2 ON data1.SampleID LIKE data2.SampleID || '-%'")
Or using data.table, like the following:
library(data.table)
dt1 <- data.table(data1)
dt2 <- data.table(data2)
# data.table joins on column equality, not on a pattern match, so first
# derive the join key: the part of data1's SampleID before the hyphen.
dt1[, Prefix := sub("-.*$", "", SampleID)]
# Full outer join on the derived key, then drop the helper column.
merge(dt1, dt2, by.x = "Prefix", by.y = "SampleID", all = TRUE)[
  , Prefix := NULL
][]
To add to collection, here is a dplyr solution which reads a bit easier:
# Loaded here so the snippet runs on its own (%>%, mutate, left_join, select).
library(dplyr)
# Keep SampleID as character so the join keys compare as plain strings.
options(stringsAsFactors = FALSE)
SampleID <- c("A-01", "B-01", "C-01")
Value <- c(1, 2, 3)
data1 <- data.frame(SampleID, Value)
SampleID <- c("A", "B", "C")
Value1 <- c(3, 4, 5)
data2 <- data.frame(SampleID, Value1)
# Strip everything that is not a letter from data1's SampleID to obtain the
# key used in data2, join on it, then drop the helper column.
data1 %>%
  mutate(new_id = gsub("[^[:alpha:]]", "", SampleID)) %>%
  left_join(data2, by = c("new_id" = "SampleID")) %>%
  select(-new_id)
SampleID Value Value1
1 A-01 1 3
2 B-01 2 4
3 C-01 3 5

Resources