Combine csv files with redundant columns R - r

I have a series of .csv files that look like this :
a.csv contains
id, a, b, c
1, 10, 0, 0
2, 3, 0 , 0
3, 20, 0, 0
b.csv contains
id, a, b, c
1, 0, 7, 0
2, 0, 9, 0
3, 0, 14, 0
c.csv contains
id, a, b, c
1, 0, 0, 12
2, 0, 0, 8
3, 0, 0, 22
I'm trying to figure out the most efficient way to read them in and create a dataframe that looks like this
id, a, b, c
1, 10, 7, 12
2, 3, 9, 8
3, 20, 14, 22
What would be the best way to do this if there are many more files with many more columns and rows? tidyverse is preferred.

How about this. If all redundant columns have zeros, then you can go long, filter out the zeros, bind the rows, and then go wide.
library(tidyverse)
df_a <- read_table("id a b c
1 10 0 0
2 3 0 0
3 20 0 0")
df_b <- read_table("id a b c
1 0 7 0
2 0 9 0
3 0 14 0")
df_c <- read_table("id a b c
1 0 0 12
2 0 0 8
3 0 0 22")
list(df_a, df_b, df_c)|>
map(\(d) pivot_longer(d, cols = -id) |>
filter(value >0)) |>
bind_rows() |>
pivot_wider(names_from = name, values_from = value)
#> # A tibble: 3 x 4
#> id a b c
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 10 7 12
#> 2 2 3 9 8
#> 3 3 20 14 22
Or better yet, read in the data marking 0 as NA and then coalesce the data frames.
df_a <- read_table("id a b c
1 10 0 0
2 3 0 0
3 20 0 0", na = "0")
df_b <- read_table("id a b c
1 0 7 0
2 0 9 0
3 0 14 0", na = "0")
df_c <- read_table("id a b c
1 0 0 12
2 0 0 8
3 0 0 22", na = "0")
coalesce(df_a, df_b, df_c)
#> # A tibble: 3 x 4
#> id a b c
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 10 7 12
#> 2 2 3 9 8
#> 3 3 20 14 22
Or if you can read the data in with NA, you can define 0 as NA:
list(df_a, df_b, df_c) |>
map(\(d) mutate(d, across(everything(), \(x) ifelse(x == 0, NA, x)))) |>
reduce(coalesce)
#> # A tibble: 3 x 4
#> id a b c
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 10 7 12
#> 2 2 3 9 8
#> 3 3 20 14 22

A base R solution, given the symmetry of the files.
Read the files
file_names <- list.files(pattern="^[abc]\\.csv")
lis <- sapply(file_names, function(x) list(read.csv(x, header=T)))
lis
$a.csv
id a b c
1 1 10 0 0
2 2 3 0 0
3 3 20 0 0
$b.csv
id a b c
1 1 0 7 0
2 2 0 9 0
3 3 0 14 0
$c.csv
id a b c
1 1 0 0 12
2 2 0 0 8
3 3 0 0 22
Combine the columns
column_names <- c("a","b","c")
cbind( lis[["a.csv"]]["id"], sapply(lis, function(x) rowSums(x[column_names])) )
id a.csv b.csv c.csv
1 1 10 7 12
2 2 3 9 8
3 3 20 14 22

Related

Create a new dataframe with 1's and 0's from summarized data?

I have the below dataset that I am working with in R:
df <- data.frame(day=seq(1,3,1), tot.infected=c(1,2,4), tot.ind=5)
df
And I would like to transform the tot.infected column into a binomial variable with 1's and 0's, such as the following dataframe:
df2 <- data.frame(year = c(rep(1,5), rep(2,5), rep(3,5)), infected = c(rep(1,1), rep(0,4), rep(1,2), rep(0,3), rep(1,4), rep(0,1)))
Is there a more elegant way to do this in R?
Thank you for your help!
I tried hard-coding a dataframe using rep(), but this is extremely time-consuming for large datasets and I was looking for a more elegant way to achieve this.
base R
tmp <- do.call(Map, c(list(f = function(y, inf, ind) data.frame(year = y, infected = replace(integer(ind), seq(ind) <= inf, 1L))), unname(df)))
do.call(rbind, tmp)
# year infected
# 1 1 1
# 2 1 0
# 3 1 0
# 4 1 0
# 5 1 0
# 6 2 1
# 7 2 1
# 8 2 0
# 9 2 0
# 10 2 0
# 11 3 1
# 12 3 1
# 13 3 1
# 14 3 1
# 15 3 0
dplyr
library(dplyr)
df %>%
rowwise() %>%
summarize(tibble(year = day, infected = replace(integer(tot.ind), seq(tot.ind) <= tot.infected, 1L)))
# # A tibble: 15 x 2
# year infected
# <dbl> <int>
# 1 1 1
# 2 1 0
# 3 1 0
# 4 1 0
# 5 1 0
# 6 2 1
# 7 2 1
# 8 2 0
# 9 2 0
# 10 2 0
# 11 3 1
# 12 3 1
# 13 3 1
# 14 3 1
# 15 3 0
We can do it this way:
library(dplyr)
df %>%
group_by(day) %>%
summarise(cur_data()[seq(unique(tot.ind)),]) %>%
#mutate(x = row_number())
mutate(tot.infected = ifelse(row_number() <= first(tot.infected),
first(tot.infected)/first(tot.infected), 0), .keep="used")
day tot.infected
<dbl> <dbl>
1 1 1
2 1 0
3 1 0
4 1 0
5 1 0
6 2 1
7 2 1
8 2 0
9 2 0
10 2 0
11 3 1
12 3 1
13 3 1
14 3 1
15 3 0
Using rep.int and replace, basically.
with(df, data.frame(
year=do.call(rep.int, unname(df[c(1, 3)])),
infected=unlist(Map(replace, Map(rep.int, 0, tot.ind), lapply(tot.infected, seq), 1))
))
# year infected
# 1 1 1
# 2 1 0
# 3 1 0
# 4 1 0
# 5 1 0
# 6 2 1
# 7 2 1
# 8 2 0
# 9 2 0
# 10 2 0
# 11 3 1
# 12 3 1
# 13 3 1
# 14 3 1
# 15 3 0
Data:
df <- structure(list(day = c(1, 2, 3), tot.infected = c(1, 2, 4), tot.ind = c(5,
5, 5)), class = "data.frame", row.names = c(NA, -3L))

Conditional replacing of a numeric value in dplyr

Dear all I have a data frame that looks like this
df <- data.frame(time=c(1,2,3,4,1,2,3,4,5), type=c("A","A","A","A","B","B","B","B","B"), count=c(10,0,0,1,8,0,1,0,1))
df
time type count
1 1 A 10
2 2 A 0
3 3 A 0
4 4 A 1
5 1 B 8
6 2 B 0
7 3 B 1
8 4 B 0
9 5 B 1
I want to examine each group of types and if I see that one count is 0 then to replace the next count forward in time with 0. I do not count to be resurrected from the zero.
I want my data to looks like this
time type count
1 1 A 10
2 2 A 0
3 3 A 0
4 4 A 0
5 1 B 8
6 2 B 0
7 3 B 0
8 4 B 0
9 5 B 0
If I understood correctly
library(tidyverse)
df <-
data.frame(
time = c(1, 2, 3, 4, 1, 2, 3, 4, 5),
type = c("A", "A", "A", "A", "B", "B", "B", "B", "B"),
count = c(10, 0, 0, 1, 8, 0, 1, 0, 1)
)
df %>%
group_by(type) %>%
mutate(count = if_else(lag(count, default = first(count)) == 0, 0, count))
#> # A tibble: 9 x 3
#> # Groups: type [2]
#> time type count
#> <dbl> <chr> <dbl>
#> 1 1 A 10
#> 2 2 A 0
#> 3 3 A 0
#> 4 4 A 0
#> 5 1 B 8
#> 6 2 B 0
#> 7 3 B 0
#> 8 4 B 0
#> 9 5 B 0
Created on 2021-09-10 by the reprex package (v2.0.1)
You may use cummin function.
library(dplyr)
df %>% group_by(type) %>% mutate(count = cummin(count))
# time type count
# <dbl> <chr> <dbl>
#1 1 A 10
#2 2 A 0
#3 3 A 0
#4 4 A 0
#5 1 B 8
#6 2 B 0
#7 3 B 0
#8 4 B 0
#9 5 B 0
Since cummin is a base R function you may also implement it in base R -
transform(df, count = ave(count, type, FUN = cummin))

R - Grouping rows by matching value then adding rows to matching columns in another data frame

I am trying to add values from one data frame (ex2) to an existing data frame (ex1) based on two different columns. As you can see, there is an ID column in both data frames. But in ex2, each column of ex1 is represented by a different row instead of a column. For each matching ID, I want to add the result from ex2$result to the matching row in ex1 under the appropriate column heading (if ex2$alpha[i] = a then ex2$result[i] gets added to ex1$a[z] where ex2$id[i]=ex1$id[z]). Another complication is that not all of the columns from ex1 will have alpha value in ex2, so those should be set as 'NA'.
ex1 <- data.frame(
id = c(1:20),
a = c(rep(1,5),rep(0,5),rep(NA,10)),
b = c(rep(c(1,0),5),rep(NA,10)),
c = c(rep(c(0,1),5),rep(NA,10)),
d = c(rep(0,5),rep(1,5),rep(NA,10))
)
ex2 <- data.frame(
id = c(rep(11,3),rep(12,3),rep(13,3),
rep(14,2),rep(15,2),
rep(16,4),rep(17,4),rep(18,4),rep(19,4),rep(20,4)),
alpha = c(rep(c('a','b','d'),3),rep(c('a','b'),2),
rep(c('a','b','c','d'),5)),
result = c(rep(c(0,1,1),11))
)
Thanks for your help!
I believe the attached snippet does what you want it to do. But it is hard to know from your toy data if it is feasible to write out the columns a to d in the mutate statement. There surely is a more clever programmatic way to approach this problem.
ex1 <- data.frame(
id = c(1:20),
a = c(rep(1,5),rep(0,5),rep(NA,10)),
b = c(rep(c(1,0),5),rep(NA,10)),
c = c(rep(c(0,1),5),rep(NA,10)),
d = c(rep(0,5),rep(1,5),rep(NA,10))
)
ex2 <- data.frame(
id = c(rep(11,3),rep(12,3),rep(13,3),
rep(14,2),rep(15,2),
rep(16,4),rep(17,4),rep(18,4),rep(19,4),rep(20,4)),
alpha = c(rep(c('a','b','d'),3),rep(c('a','b'),2),
rep(c('a','b','c','d'),5)),
result = c(rep(c(0,1,1),11))
)
library(tidyverse)
ex_2_wide <- pivot_wider(ex2, id_cols = id, names_from = alpha, values_from = result )
joined <- full_join(ex1, ex_2_wide, by = c("id" = "id")) %>%
mutate(a = coalesce(a.x, a.y)) %>%
mutate(b = coalesce(b.x, b.y)) %>%
mutate(c = coalesce(c.x, c.y)) %>%
mutate(d = coalesce(d.x, d.y)) %>%
select(-(a.x:c.y))
joined
#> id a b c d
#> 1 1 1 1 0 0
#> 2 2 1 0 1 0
#> 3 3 1 1 0 0
#> 4 4 1 0 1 0
#> 5 5 1 1 0 0
#> 6 6 0 0 1 1
#> 7 7 0 1 0 1
#> 8 8 0 0 1 1
#> 9 9 0 1 0 1
#> 10 10 0 0 1 1
#> 11 11 0 1 NA 1
#> 12 12 0 1 NA 1
#> 13 13 0 1 NA 1
#> 14 14 0 1 NA NA
#> 15 15 1 0 NA NA
#> 16 16 1 1 0 1
#> 17 17 1 0 1 1
#> 18 18 0 1 1 0
#> 19 19 1 1 0 1
#> 20 20 1 0 1 1
Created on 2021-01-07 by the reprex package (v0.3.0)
EDIT:
If we turn the problem around (we first make long tables, followed by join and merge, then pivot back wide), there is only a single step for merger, no matter how many columns you have.
library(tidyverse)
ex1_long <- pivot_longer(ex1, cols = a:d, names_to = "alpha")
joined <- full_join(ex1_long, ex2, by = c("id" = "id", "alpha" = "alpha")) %>%
mutate(value = coalesce(value, result)) %>% select(-result) %>%
pivot_wider(id_cols = id, names_from = alpha, values_from = value)
joined
#> # A tibble: 20 x 5
#> id a b c d
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 1 0 0
#> 2 2 1 0 1 0
#> 3 3 1 1 0 0
#> 4 4 1 0 1 0
#> 5 5 1 1 0 0
#> 6 6 0 0 1 1
#> 7 7 0 1 0 1
#> 8 8 0 0 1 1
#> 9 9 0 1 0 1
#> 10 10 0 0 1 1
#> 11 11 0 1 NA 1
#> 12 12 0 1 NA 1
#> 13 13 0 1 NA 1
#> 14 14 0 1 NA NA
#> 15 15 1 0 NA NA
#> 16 16 1 1 0 1
#> 17 17 1 0 1 1
#> 18 18 0 1 1 0
#> 19 19 1 1 0 1
#> 20 20 1 0 1 1
Created on 2021-01-07 by the reprex package (v0.3.0)

Cumulative sum of dataframe up to a certain line (row)

I would like to have cumulative sum of following data:
c1 c2 c3
1 3 6 3
2 4 3 2
3 6 2 5
4 1 5 4
5 0 0 0
6 0 0 0
but up to 4th line (row). For example, a following code with produce general cumulative sum of dataframe including all the rows over the columns
library(readxl)
library(xts)
library("xlsx")
library(dplyr)
library(data.table)
library(tidyverse)
D <- structure(list(c1 = c(3, 4, 6, 1, 0, 0), c2 = c(6, 3, 2, 5, 0,
0), c3 = c(3, 2, 5, 4, 0, 0)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
D
csD <- cumsum(D)
csD
resulting with
c1 c2 c3
1 3 6 3
2 7 9 5
3 13 11 10
4 14 16 14
5 14 16 14
6 14 16 14
However, I would like to have:
c1 c2 c3
1 3 6 3
2 7 9 5
3 13 11 10
4 14 16 14
5 0 0 0
6 0 0 0
Thank you in advance. Alan
csD*(D!=0)
c1 c2 c3
1 3 6 3
2 7 9 5
3 13 11 10
4 14 16 14
5 0 0 0
6 0 0 0
Does this work:
> rbind(cumsum(D[1:(min(which(rowSums(D) == 0))-1), ]), cumsum(D[min(which(rowSums(D) == 0)):nrow(D), ]))
# A tibble: 6 x 3
c1 c2 c3
<dbl> <dbl> <dbl>
1 3 6 3
2 7 9 5
3 13 11 10
4 14 16 14
5 0 0 0
6 0 0 0
>
Maybe not the most optimal way but you can define N and use apply() and rbind() like this:
#Code
#Define N
N <- 4
#Compute
newdf <- rbind(apply(D,2,function(x) cumsum(x[1:N])),
D[(N+1):nrow(D),])
Output:
newdf
c1 c2 c3
1 3 6 3
2 7 9 5
3 13 11 10
4 14 16 14
5 0 0 0
6 0 0 0
We can convert the NA to 0 (na_if), get the cumsum and replace the NAwith 0 (replace_na) across all the columns
library(dplyr)
library(tidyr)
D %>%
mutate(across(everything(), ~replace_na(cumsum(na_if(., 0)), 0)))
-output
# A tibble: 6 x 3
# c1 c2 c3
# <dbl> <dbl> <dbl>
#1 3 6 3
#2 7 9 5
#3 13 11 10
#4 14 16 14
#5 0 0 0
#6 0 0 0
Or if we want to specify a row number
D %>%
mutate(across(everything(), ~ case_when(row_number() <=4 ~
cumsum(.), TRUE ~ .)))

Remove last n rows for 0 by id and then tail last n elements by Id in R

final datasetI have a dataset as attached.
I want to remove all Last 0s in
binary by Id, and then select tail 3 for
by id. (Note: for Id 2 only remainng 2
w5 and w6 ). It will be like the final dataset.
Is there any efficient way to do it?
How about something like this?
ID = c(rep(1, 8), rep(2, 4), rep(3, 8))
week = c(2:9, 5:8, 13:20)
binary = c(0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0)
dataset = data.frame(ID, week, binary)
> dataset
ID week binary
1 1 2 0
2 1 3 1
3 1 4 1
4 1 5 1
5 1 6 1
6 1 7 0
7 1 8 0
8 1 9 0
9 2 5 1
10 2 6 1
11 2 7 0
12 2 8 0
13 3 13 1
14 3 14 1
15 3 15 1
16 3 16 0
17 3 17 0
18 3 18 1
19 3 19 0
20 3 20 0
Solution:
# Split dataset by ID
byGroupDf = split(dataset, ID)
# For each grouped dataset, take only rows up till the last row where ID not equal to 0
byGroupFinal = lapply(byGroupDf, function(x) tail(x[1:max(which(x$binary != 0)),], 3))
# Combine the grouped datasets
FinalDf = do.call(rbind, byGroupFinal)
> FinalDf
ID week binary
1.3 1 4 1
1.4 1 5 1
1.5 1 6 1
2.9 2 5 1
2.10 2 6 1
3.16 3 16 0
3.17 3 17 0
3.18 3 18 1

Resources