Split the dataset in R [duplicate] - r

This question already has answers here:
Transpose / reshape dataframe without "timevar" from long to wide format
(9 answers)
Closed 6 years ago.
I have a dataset which contains Billno and Product columns in the following format:
Billno Product
1 123
1 176
2 189
3 1
3 2
3 44
3 46
etc
The output should be a table of the form:
Billno Prod1 Prod2 Prod3 Prod4
1 123 176
2 189
3 1 2 44 46
Split function works, but the dataset contains more than a million records. Is there an efficient way of doing this?

with dplyr:
library(dplyr)
library(tidyr)
# Example data: five copies of the bill/product pairs from the question.
bill <- rep(c(1,1,2,3,3,3,3),5)
prod <- rep(c(123,176,189, 1,2,44,46),5)
df <- data.frame(bill=bill, prod=prod)
#determine max product count (number of columns in result)
prodmax <- df %>% group_by(bill) %>% summarise(n = n())
# Number the products within each bill, then pivot them into prod1..prodK
# columns. pivot_wider() replaces the retired spread(), and
# select(all_of(...)) replaces the long-deprecated select_(.dots = ...);
# seq_len() is the safe form of seq(1, n).
df %>%
  group_by(bill) %>%
  mutate(prodn = paste0("prod", row_number())) %>%
  ungroup() %>%
  pivot_wider(names_from = prodn, values_from = prod) %>%
  #select columns in correct order
  select(all_of(c("bill", paste0("prod", seq_len(max(prodmax$n))))))
results in:
bill prod1 prod2 prod3 prod4
(dbl) (dbl) (dbl) (dbl) (dbl)
1 1 123 176 NA NA
2 2 189 NA NA NA
3 3 1 2 44 46

You can do
# Read the example data from inline text.
df <- read.table(header = TRUE, text = "Billno Product
1 123
1 176
2 189
3 1
3 2
3 44
3 46")
# One vector of Product values per Billno.
lst <- split(df[, -1], df[, 1])
# Pad each vector with NA out to the length of the longest one.
lst <- lapply(lst, function(v) {
  length(v) <- max(lengths(lst))
  v
})
# Row-bind the padded vectors into the wide data.frame.
df <- as.data.frame(do.call(rbind, lst))
#    V1  V2 V3 V4
# 1 123 176 NA NA
# 2 189  NA NA NA
# 3   1   2 44 46
and then
# Rename the auto-generated V* columns to prod*.
colnames(df) <- sub("V", "prod", colnames(df))
# The bill numbers survived as row names; copy them into a real column.
df$billno <- rownames(df)

This will also do:
# Transpose each Billno's rows and keep the Product line (row 2).
parts <- lapply(split(df, df$Billno), function(x) t(x)[2, ])
# Pad every product vector to the longest one (extra slots become NA),
# then bind the vectors into the rows of a data.frame.
df <- as.data.frame(do.call(rbind, lapply(
  lapply(parts, unlist), "[",
  seq_len(max(vapply(parts, length, integer(1))))
)))
names(df) <- paste0("Prod", seq_len(ncol(df)))
df
Prod1 Prod2 Prod3 Prod4
1 123 176 NA NA
2 189 NA NA NA
3 1 2 44 46

Related

Collapse data frame so NAs are removed

I want to collapse this data frame so NA's are removed. How to accomplish this? Thanks!!
# Example data: ids with varying numbers of q1 values.
id <- c(1,1,1,2,2,3,4,5,5)
q1 <- c(23,55,7,88,90,34,11,22,99)
df <- data.frame(id,q1)
# spread() needs a unique row identifier, so number the rows first; this is
# also why each value lands on its own row and the NA padding appears.
df$row <- 1:nrow(df)
spread(df, id, q1)
row 1 2 3 4 5
1 23 NA NA NA NA
2 55 NA NA NA NA
3 7 NA NA NA NA
4 NA 88 NA NA NA
5 NA 90 NA NA NA
6 NA NA 34 NA NA
7 NA NA NA 11 NA
8 NA NA NA NA 22
9 NA NA NA NA 99
I want it to look like this:
1 2 3 4 5
23 88 34 11 22
55 90 NA NA 99
7 NA NA NA NA
::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
The row should be created on the sequence of 'id'. In addition, pivot_wider would be a more general function compared to spread
library(dplyr)
library(tidyr)
# Number the rows within each id so pivot_wider() can align the values,
# then drop the helper column from the result.
df %>%
  group_by(id) %>%
  mutate(row = row_number()) %>%
  ungroup() %>%
  pivot_wider(names_from = id, values_from = q1) %>%
  select(-row)
-output
# A tibble: 3 × 5
`1` `2` `3` `4` `5`
<dbl> <dbl> <dbl> <dbl> <dbl>
1 23 88 34 11 22
2 55 90 NA NA 99
3 7 NA NA NA NA
Or use dcast
library(data.table)
# rowid(id) numbers the rows within each id (the implicit row key),
# dcast() spreads q1 across the id columns, and [, id := NULL][] removes
# the helper column by reference and prints the result.
dcast(setDT(df), rowid(id) ~ id, value.var = 'q1')[, id := NULL][]
1 2 3 4 5
<num> <num> <num> <num> <num>
1: 23 88 34 11 22
2: 55 90 NA NA 99
3: 7 NA NA NA NA
Here's a base R solution. I sort each column so the non-NA values are at the top, find the number of non-NA values in the column with the most non-NA values (n), and return the top n rows from the data frame.
library(tidyr)
# Rebuild the wide example data frame from the question.
id <- c(1, 1, 1, 2, 2, 3, 4, 5, 5)
q1 <- c(23, 55, 7, 88, 90, 34, 11, 22, 99)
df <- data.frame(id, q1)
df$row <- seq_len(nrow(df))
df <- spread(df, id, q1)
collapse_df <- function(df) {
  # Push each column's non-NA values to the top, then keep only the first
  # n rows, where n is the largest count of non-NA values in any column.
  #
  # df: a data.frame whose columns may contain NA padding.
  # Returns a matrix (sapply collapses the sorted columns) of the kept rows.
  move_na_to_bottom <- function(x) x[order(is.na(x))]
  sorted <- sapply(df, move_na_to_bottom)
  count_non_na <- function(x) sum(!is.na(x))
  # vapply is type-stable, and iterating the data.frame's columns directly
  # avoids apply()'s coercion of the whole frame to a matrix.
  n <- max(vapply(df, count_non_na, integer(1)))
  # drop = FALSE keeps a single surviving row as a matrix (the original
  # `sorted[1:n, ]` dropped it to a vector); seq_len() also handles the
  # all-NA case (n == 0), where 1:n would wrongly yield c(1, 0).
  sorted[seq_len(n), , drop = FALSE]
}
# Apply it to the spread data, dropping the helper "row" column first.
collapse_df(df[, -1])

How to bring information from one dataframe to another without losing information for duplicates?

Consider df1:
# Build df1 as a data.frame directly: as.data.frame(cbind(...)) routes the
# data through a matrix, which would coerce mixed types to one type.
id <- c(1, 2, 3, 4, 5, 6)
n_df2 <- c(1, 1, 2, 2, 3, 3)
df1 <- data.frame(id = id, n_df2 = n_df2)
df1
where n_df2 is the number of appearances for that id in df2.
# Same idea for df2: construct the data.frame directly instead of via
# as.data.frame(cbind(...)), which goes through a matrix.
id2 <- c(1, 2, 3, 4, 5, 6, 3, 4, 5, 5, 6, 6)
value <- c(25, 35, 46, 78, 12, 34, 12, 33, 87, 56, 11, 8)
df2 <- data.frame(id2 = id2, value = value)
df2
(id2 is equivalent to id)
I want df1 to look like this:
# Desired result: one value.k column per appearance of the id in df2.
df1$value.1 <- c(25, 35, 46, 78, 12, 34)
df1$value.2 <- c(NA, NA, 12, 33, 87, 11)
df1$value.3 <- c(NA, NA, NA, NA, 56, 8)
df1
Any help will be very much appreciated!
Thanks.
In base R you could do:
# Number each id2's appearances (time = 1, 2, ...), reshape df2 to wide
# (value.1, value.2, value.3; timevar defaults to "time"), then left-join
# the result onto df1.
# Note: reshape()'s argument is spelled out as direction = -- the original
# relied on partial argument matching via dir =.
merge(
  df1,
  reshape(
    transform(df2[c("id2", "value")], time = ave(id2, id2, FUN = seq_along)),
    direction = "wide", idvar = "id2"
  ),
  by.x = "id", by.y = "id2", all.x = TRUE
)
id n_df2 value.1 value.2 value.3
1 1 1 25 NA NA
2 2 1 35 NA NA
3 3 2 46 12 NA
4 4 2 78 33 NA
5 5 3 12 87 56
6 6 3 34 11 8
In tidyverse
library(tidyverse)
# Widen df2 so each id2 gets one value.k column per appearance, then
# left-join that onto df1 matching id to id2.
df1 %>%
left_join(df2 %>%
select(id2, value) %>%
group_by(id2) %>%
# time = running appearance number within each id2 (becomes the .k suffix)
mutate(time = row_number()) %>%
pivot_wider(id2, names_from=time, names_prefix='value.', values_from = 'value'),
c('id'='id2'))
id n_df2 value.1 value.2 value.3
1 1 1 25 NA NA
2 2 1 35 NA NA
3 3 2 46 12 NA
4 4 2 78 33 NA
5 5 3 12 87 56
6 6 3 34 11 8

R spread dataframe [duplicate]

This question already has answers here:
Reshape multiple value columns to wide format
(5 answers)
Closed 2 years ago.
IN R language how to convert
data1 into data2
# Input: long format, one row per (id, year) with three measure columns.
data1 = fread("
id year cost pf loss
A 2019-02 155 10 41
B 2019-03 165 14 22
B 2019-01 185 34 56
C 2019-02 350 50 0
A 2019-01 310 40 99")
# Desired output: one row per (id, item), one column per year.
# NOTE: two values in the original post contradicted data1 and are corrected
# here -- A/cost/2019-01 is 310 (not 30) and B/cost/2019-03 is 165 (not 160),
# as the answers' outputs below confirm.
data2 <- fread("
id item 2019-01 2019-02 2019-03
A cost 310 155 NA
A pf 40 10 NA
A loss 99 41 NA
B cost 185 NA 165
B pf 34 NA 14
B loss 56 NA 22
C cost NA 350 NA
C pf NA 50 NA
C loss NA 0 NA")
I tried to use spread, gather, dplyr, apply... but without success.
First get the data in long format and then get it back in wide.
library(tidyr)
# Melt the three measure columns into long form, then spread the years
# back out across the columns.
data1 %>%
  pivot_longer(cols = c(cost, pf, loss)) %>%
  pivot_wider(names_from = year, values_from = value)
Note that gather and spread have been retired and replaced by pivot_longer and pivot_wider.
Using data.table :
library(data.table)
# melt() stacks cost/pf/loss into (variable, value) long form; dcast() then
# spreads the year values into columns, one row per id x variable pair.
dcast(melt(data1, c('id', 'year')), id+variable~year, value.var = 'value')
# id variable 2019-01 2019-02 2019-03
#1: A cost 310 155 NA
#2: A pf 40 10 NA
#3: A loss 99 41 NA
#4: B cost 185 NA 165
#5: B pf 34 NA 14
#6: B loss 56 NA 22
#7: C cost NA 350 NA
#8: C pf NA 50 NA
#9: C loss NA 0 NA

Joining dataframes with different dimensions and filling the gaps

I want to join two different dataframes. Those dataframes are of different dimensions. Here are the example datasets,
Main dataset
# Main data
id <- c(1, 1, 1, 3, 3, 3, 10)
time <- c(201601, 201602, 201603, 201601, 201602, 201603, 201601)
data1 <- c(100, 150, 160, 111, 120, 130, 150)
data2 <- c(5, 6, 9, 3, 2, 1, 0)
dataf1 <- data.frame(id, time, data1, data2)
Dataframe to be joined with the main dataset
# Additional data: one new 201604 observation each for ids 3, 10 and 2.
id <- c(3, 10, 2)
time <- rep(201604, 3)
data2 <- c(20, 30, 11)
dataf2 <- data.frame(id, time, data2)
I want to join these two dataframes, namely, dataf1 and dataf2. I have tried dplyr::full_join(dataf1, dataf2, by = "id") but it's not giving what I want. The expected join should look like this,
However, the final output should include the missing timestamps. The final output should look like this,
Is there any way I can achieve this?
Here is a data.table go at your question
library(data.table)
# Convert the data.frames to data.tables in place (by reference).
setDT(dataf1)
setDT(dataf2)
# Stack both tables; columns missing from one side are filled with NA.
stacked <- rbindlist(list(dataf1, dataf2), use.names = TRUE, fill = TRUE)
# Build the full grid: every unique id crossed with every unique time.
grid <- CJ(stacked$id, stacked$time, unique = TRUE)
setnames(grid, names(grid), c("id", "time"))
# Update-join: copy data1/data2 from the stacked rows onto the grid.
ans <- grid[stacked, `:=`(data1 = i.data1, data2 = i.data2), on = .(id, time)]
output:
# id time data1 data2
# 1: 1 201601 100 5
# 2: 1 201602 150 6
# 3: 1 201603 160 9
# 4: 1 201604 NA NA
# 5: 2 201601 NA NA
# 6: 2 201602 NA NA
# 7: 2 201603 NA NA
# 8: 2 201604 NA 11
# 9: 3 201601 111 3
# 10: 3 201602 120 2
# 11: 3 201603 130 1
# 12: 3 201604 NA 20
# 13:10 201601 150 0
# 14:10 201602 NA NA
# 15:10 201603 NA NA
# 16:10 201604 NA 30
As you can see, it (almost) matches your desired output.
I am confused about why you wanted id = 10 & time = 201604 ==> data1 = 30. Why that behaviour, when data1 = NA and data2 = 30?
Of course you can easily exchange data1 with data2 using an ifelse-like solution such as ans[ is.na(data1) & !is.na(data2), `:=`(data1 = data2, data2 = NA)]
Here is one way using tidyr::complete with dplyr. After doing a full_join, we convert time column to Date object. For every id complete the sequence from the minimum value to '2016-04-01' and remove NA rows.
library(dplyr)
# Full-join the two tables, keep dataf1's columns, complete each id's
# monthly sequence through 2016-04, then drop rows that are NA in every
# column besides id. rename_all() and filter_at() have been superseded
# since dplyr 1.0; rename_with() and filter(if_any()) are the current
# equivalents.
full_join(dataf1, dataf2, by = "id") %>%
  select(-time.y, -data2.y) %>%
  rename_with(~ names(dataf1)) %>%
  # Parse yyyymm into a real Date so seq() can step by month.
  mutate(time1 = as.Date(paste0(time, "01"), "%Y%m%d")) %>%
  tidyr::complete(id, time1 = seq(min(time1, na.rm = TRUE),
                                  as.Date("2016-04-01"), by = "1 month")) %>%
  mutate(time = format(time1, "%Y%m")) %>%
  filter(if_any(-id, ~ !is.na(.x))) %>%
  select(-time1)
# id time data1 data2
# <dbl> <chr> <dbl> <dbl>
# 1 1 201601 100 5
# 2 1 201602 150 6
# 3 1 201603 160 9
# 4 1 201604 NA NA
# 5 2 201601 NA NA
# 6 2 201602 NA NA
# 7 2 201603 NA NA
# 8 2 201604 NA NA
# 9 3 201601 111 3
#10 3 201602 120 2
#11 3 201603 130 1
#12 3 201604 NA NA
#13 10 201601 150 0
#14 10 201602 NA NA
#15 10 201603 NA NA
#16 10 201604 NA NA
This matches your exact final output:
library(data.table)
# NOTE: setnames() modifies dataf2 in place (data.table reference semantics).
setnames(dataf2, "data2", "data1")
# Stack both tables (fill = TRUE pads missing columns with NA), then
# right-join onto the full id x time cross product so every combination
# gets a row.
rbindlist(list(dataf1, dataf2), fill = TRUE)[
  CJ(id, time, unique = TRUE), on = .(id, time)
]
# id time data1 data2
# 1: 1 201601 100 5
# 2: 1 201602 150 6
# 3: 1 201603 160 9
# 4: 1 201604 NA NA
# 5: 2 201601 NA NA
# 6: 2 201602 NA NA
# 7: 2 201603 NA NA
# 8: 2 201604 11 NA
# 9: 3 201601 111 3
# 10: 3 201602 120 2
# 11: 3 201603 130 1
# 12: 3 201604 20 NA
# 13: 10 201601 150 0
# 14: 10 201602 NA NA
# 15: 10 201603 NA NA
# 16: 10 201604 30 NA

How to split a data set with duplicated informations based on date

I have this situation:
ID date Weight
1 2014-12-02 23
1 2014-10-02 25
2 2014-11-03 27
2 2014-09-03 45
3 2014-07-11 56
3 NA 34
4 2014-10-05 25
4 2014-08-09 14
5 NA NA
5 NA NA
And I would like split the dataset in this, like this:
1-
ID date Weight
1 2014-12-02 23
1 2014-10-02 25
2 2014-11-03 27
2 2014-09-03 45
4 2014-10-05 25
4 2014-08-09 14
2- Lowest Date
ID date Weight
3 2014-07-11 56
3 NA 34
5 NA NA
5 NA NA
I tried this for second dataset:
# Attempt: sort by ID then date, and keep the duplicated IDs. This only
# drops the first row of each ID; it does not separate IDs with NA dates.
dt <- dt[order(dt$ID, dt$date), ]
dt.2=dt[duplicated(dt$ID), ]
but didn't work
Get the ID's for which date are NA and then subset based on that
# IDs that have at least one missing date.
NA_ids <- unique(df$ID[is.na(df$date)])
# First piece: rows whose ID never has a missing date.
subset(df, !(ID %in% NA_ids))
# ID date Weight
#1 1 2014-12-02 23
#2 1 2014-10-02 25
#3 2 2014-11-03 27
#4 2 2014-09-03 45
#7 4 2014-10-05 25
#8 4 2014-08-09 14
# Second piece: rows whose ID has at least one missing date.
subset(df, ID %in% NA_ids)
# ID date Weight
#5 3 2014-07-11 56
#6 3 <NA> 34
#9 5 <NA> NA
#10 5 <NA> NA
Using dplyr, we can create a new column which has TRUE/FALSE for each ID based on presence of NA and then use group_split to split into list of two.
library(dplyr)
# Flag every row of an ID as TRUE when that ID has any NA date, then split
# the frame into two tibbles on that flag. The `keep` argument of
# group_split() was renamed `.keep` in dplyr 1.0, so use the current name.
df %>%
  group_by(ID) %>%
  mutate(NA_ID = any(is.na(date))) %>%
  ungroup() %>%
  group_split(NA_ID, .keep = FALSE)
The above dplyr logic can also be implemented in base R by using ave and split
# Base R: flag each row's ID as TRUE when that ID has any missing date,
# then split on the flag, dropping the helper column (4) from the pieces.
df$NA_ID <- ave(is.na(df$date), df$ID, FUN = any)
split(df[-4], df$NA_ID)

Resources