keep a column when expanding dataframe to fill using tidyr::complete - r

Regarding how to fill missing rows in a data frame i used this example
df <- read.table(textConnection("car,year,month,country,amount
Mazda,2012,02,JP,2344
Ford,2012,04,US,235234
Mazda,2012,03,JP,3455
Mazda,2012,04,JP,43554
Mazda,2012,05,JP,9854
Mazda,2012,06,JP,32556
Ford, 2013,01,US,345"), sep = ",", header = TRUE)
> df
car year month country amount
1 Mazda 2012 2 JP 2344
2 Ford 2012 4 US 235234
3 Mazda 2012 3 JP 3455
4 Mazda 2012 4 JP 43554
5 Mazda 2012 5 JP 9854
6 Mazda 2012 6 JP 32556
7 Ford 2013 1 US 345
I use tidyr::complete to fill missing rows for month and year this way:
tidyr::complete(df, car = unique(car), year = 2012:2014, month=1:12, fill=list(amount=0))
but country is lost. i've read tidyr documentation but it's really short and could'nt find any other SO answer on this.
# A tibble: 108 x 5
car year month country amount
<fct> <int> <int> <fct> <dbl>
1 " Ford" 2012 1 NA 0
2 " Ford" 2012 2 NA 0
3 " Ford" 2012 3 NA 0
4 " Ford" 2012 4 US 235234
5 " Ford" 2012 5 NA 0
6 " Ford" 2012 6 NA 0
7 " Ford" 2012 7 NA 0
8 " Ford" 2012 8 NA 0
9 " Ford" 2012 9 NA 0
10 " Ford" 2012 10 NA 0
# ... with 98 more rows
How to preserve it?

We can place it in nesting
library(tidyverse)
df %>%
complete(car = unique(car), year = 2012:2014, month = 1:12,
nesting(country), fill = list(amount = 0))

Since you neglected to note that you opened a new question in the second-ask on the original, just maintain a metadata data frame:
read.table(textConnection("car,year,month,amount
Mazda,2012,02,2344
Ford,2012,04,235234
Mazda,2012,03,3455
Mazda,2012,04,43554
Mazda,2012,05,9854
Mazda,2012,06,32556
Ford,2013,01,2345"),
sep = ",", header = TRUE, stringsAsFactors = FALSE) -> xdf
merge(
expand.grid(car = unique(xdf$car), year =2012:2014, month=1:12),
xdf, by = c("car", "year", "month"), all.x = TRUE
) -> x2
x2$amount <- ifelse(is.na(x2$amount), 0, x2$amount)
data.frame(
car = c("Mazda", "Ford"),
country = c("JP", "US"),
stringsAsFactors = FALSE
) -> car2country_df
merge(x2, car2country_df)
or via tidyverse:
tidyr::complete(
xdf, car = unique(car), year = 2012:2014, month=1:12, fill=list(amount=0)
) %>%
dplyr::left_join(car2country_df)

Related

How to speed up the computation of the intersections between each pair of sets for a large number of pairs

I have the following dataframe:
> str(database)
'data.frame': 8547287 obs. of 4 variables:
$ cited_id : num 4.06e+08 5.41e+07 5.31e+07 5.04e+07 3.79e+08 ...
$ cited_pub_year : num 2014 1989 2002 2002 2015 ...
$ citing_id : num 3.34e+08 3.37e+08 4.06e+08 4.19e+08 4.25e+08 ...
$ citing_pub_year: num 2011 2011 2013 2014 2014 ...
The variables cited_id and citing_id contain the IDs of the objects from which this database has been obtained.
This is an example of the dataframe:
cited_id cited_pub_year citing_id citing_pub_year
1 405821349 2014 419185055 2011
2 405821349 1989 336621202 2011
3 53148996 2002 406314162 2013
4 53148996 2002 419185055 2014
5 379369076 2015 424901495 2014
6 53148996 2011 441055669 2015
7 405821349 2014 447519383 2015
8 405821349 2015 469644221 2016
9 329268142 2014 470861263 2016
10 45433355 2008 55422577 2008
For example the ID 405821349 has been cited by 419185055, 336621202, 447519383 and 469644221. For each pair of IDs I would like to calculate the intersection of their citing IDs. The quantity Pj.k below is the length of the intersection. I tried with the following code
total_id<-c(database$cited_id,database$citing_id)
total_id<-unique(total_id)
df<-data.frame(data_k=character(),data_j=character(),Pj.k=numeric(),
stringsAsFactors = F)
for (k in 1:(length(total_id)-1)) {
data_k<-total_id[k]
citing_data_k<-database[database$cited_id==data_k,]
for (j in (k+1):length(total_id)) {
data_j<-total_id[j]
citing_data_j<-database[database$cited_id==data_j,]
Pj.k<-length(intersect(citing_data_j$citing_id,citing_data_k$citing_id))
dfxx=data.frame(data_k=data_k,data_j=data_j,Pj.k=Pj.k,
stringsAsFactors = F)
df<-rbind(df,dfxx)
}
}
Anyway, it takes too long! How could I speed it up?
Using xtabs, tcrossprod and sparse matrices:
library(Matrix)
library(data.table)
m2 <- as(
triu(
tcrossprod(
m1 <- xtabs(data = database[,c(1, 3)], sparse = TRUE)
), k = 1
), "TsparseMatrix"
)
df <- data.frame(
data_k = row.names(m1)[attr(m2, "i") + 1L],
data_j = row.names(m1)[attr(m2, "j") + 1L],
Pj.k = attr(m2, "x"),
stringsAsFactors = FALSE
)
Inspired by answers in Count combinations of categorical variables, regardless of order, in R? , count pairs:
database = read.table(header = T, stringsAsFactors = F, text =
"cited_id cited_pub_year citing_id citing_pub_year
1 405821349 2014 419185055 2011
2 405821349 1989 336621202 2011
3 53148996 2002 406314162 2013
4 53148996 2002 419185055 2014
5 379369076 2015 424901495 2014
6 53148996 2011 441055669 2015
7 405821349 2014 447519383 2015
8 405821349 2015 469644221 2016
9 329268142 2014 470861263 2016
10 45433355 2008 55422577 2008")
database |>
dplyr::count(pairs = paste(pmin(cited_id, citing_id),
pmax(cited_id, citing_id)))
#> pairs n
#> 1 329268142 470861263 1
#> 2 336621202 405821349 1
#> 3 379369076 424901495 1
#> 4 405821349 419185055 1
#> 5 405821349 447519383 1
#> 6 405821349 469644221 1
#> 7 45433355 55422577 1
#> 8 53148996 406314162 1
#> 9 53148996 419185055 1
#> 10 53148996 441055669 1
Depending on what you actually need you might find with(database, table(cited_id = cited_id, citing_id = citing_id)) useful too.

Dropping the rows by checking whether it has multiple values in R

I have a data frame in this form;
Year Department Jan Feb ................... Dec
2017 TF 15.15 225.51 .............. 5562.1
2015 CIF ...................................
2013 TTR ....................................
2011 COR ....................
. .............................
. ......................
As a summary, I want to create an algorithm but first I have to make this filtering:
If a department does not have a value for 2013, 2014, 2015, 2016 years, than I want to exclude that department from my data set.
In other words, by reading the each departments data, filtering the data by departments that has all four years values in the months columns.
I tried exists, is.na but the multiple filtering always fails. And another handicap is that filter works for only single condition, but here I need like 4 condition. 4 years values must be exist to use them in next step.
Thank you.
I can't find a clear duplicate to this question. Seems like a quick fix with group_by:
library(dplyr)
df <- data_frame(Year = c(2013:2016, 2015, 2016),
Department = c(rep('TF', 4), 'CIF', 'TTR'))
df
#> # A tibble: 6 x 2
#> Year Department
#> <dbl> <chr>
#> 1 2013 TF
#> 2 2014 TF
#> 3 2015 TF
#> 4 2016 TF
#> 5 2015 CIF
#> 6 2016 TTR
df %>%
group_by(Department) %>%
mutate(x = Year %in% c(2013:2016),
y = sum(x)) %>%
ungroup() %>%
filter(y == 4)
#> # A tibble: 4 x 4
#> Year Department x y
#> <dbl> <chr> <lgl> <int>
#> 1 2013 TF TRUE 4
#> 2 2014 TF TRUE 4
#> 3 2015 TF TRUE 4
#> 4 2016 TF TRUE 4
A solution using R base:
df = read.table(text = "Year, Department
2016,TF
2017,TF
2013,CIF
2014,CIF
2015,CIF
2016,CIF
2013,TTR", header = TRUE, sep = ",", stringsAsFactors = FALSE)
df[df$Department %in% subset(aggregate(subset(df, Year %in% c(2013,2014,2015,2016)), by=list(n$Department), FUN=length), Department==4)[,1], ]
Output:
Year Department
3 2013 CIF
4 2014 CIF
5 2015 CIF
6 2016 CIF

arrange one below the other every 2 columns from data frame in R

Hi I have a df as below which show date and their respected
date 1_val date 2_val . . . . date n_val
2014 23 2014 33 . . . . 2014 34
2015 22 2016 12 . . . . 2016 99
i have tried with hard coding to arrange the columns one below the other
for 1&2 columns
a=1
b=2
names_2<-df[,c(a,b)]
colnames(names_2)[1]<-"Date"
names_2 <- names_2[!apply(is.na(names_2) | names_2 == "", 1, all),]
names_2<-melt(names_2,id=colnames(names_2)[1])
samp_out<-names_2
for 3&4 columns
a=3
b=4
names_2<-df[,c(a,b)]
colnames(names_2)[1]<-"Date"
names_2 <- names_2[!apply(is.na(names_2) | names_2 == "", 1, all),]
names_2<-melt(names_2,id=colnames(names_2)[1])
samp_out1<-names_2
till n-numbers
df1= rbind(samp_out,samp_out1,......samp_out_n)
output
date variable value
2014 1_val 23
2015 1_val 22
2014 2_val 33
2016 2_val 12
.
.
2014 n_val 34
2016 n_val 99
Thanks in advance
The function melt in the package data.table does that:
melt(df, id = "Date", measure = patterns("_val"))
You can specify the name of the variable to pivot on (Date in this case) and a pattern in the variables you want to keep the values of. You can also supply a vector with all the variablenames instead.
> DT <- data.table(Date = c(2014,2013), `1_val` = c(33, 32), Date = c(2014, 2013), `2_val` = c(65, 34))
> DT
Date 1_val Date 2_val
1: 2014 33 2014 65
2: 2013 32 2013 34
> melt(DT, id = "Date", measure = patterns("_val"))
Date variable value
1: 2014 1_val 33
2: 2013 1_val 32
3: 2014 2_val 65
4: 2013 2_val 34
You can use stack from base R,
setNames(data.frame(stack(df[c(TRUE, FALSE)])[1],
stack(df[c(FALSE, TRUE)])),
c('date', 'value', 'variable'))
# date value variable
#1 2014 33 1_val
#2 2013 32 1_val
#3 2014 65 2_val
#4 2013 34 2_val
Define the untidy rectangle
library(magrittr)
csv <- "date,1_val,date,2_val,date,3_val
2014,23,2014,33,2014,34
2015,22,2016,12,2016,99"
Read into a data frame, then transform into a long/eav rectangle.
ds_eav <- csv %>%
readr::read_csv() %>%
tibble::rownames_to_column(var="height") %>%
tidyr::gather(key=key, value=value, -height)
output:
# A tibble: 12 x 4
key index value height
<chr> <int> <int> <int>
1 date 1 2014 1
2 date 1 2015 2
3 value 1 23 1
4 value 1 22 2
5 date 2 2014 1
6 date 2 2016 2
7 value 2 33 1
8 value 2 12 2
9 date 3 2014 1
10 date 3 2016 2
11 value 3 34 1
12 value 3 99 2
Identify which rows are dates/values. Then shift up dates' index by 1.
ds_eav <- ds_eav %>%
dplyr::mutate(
index_val = sub("^(\\d+)_val$" , "\\1", key),
index_date = sub("^date_(\\d+)$", "\\1", key),
index_date = dplyr::if_else(key=="date", "0", index_date),
key = dplyr::if_else(grepl("^date(_\\d+)*", key), "date", "value"),
index = dplyr::if_else(key=="date", index_date, index_val),
index = as.integer(index),
index = index + dplyr::if_else(key=="date", 1L, 0L)
) %>%
dplyr::select(key, index, value, height)
Follow the advice of #jarko-dubbeldam and use spread/gather on the last step too
ds_eav %>%
tidyr::spread(key=key, value=value)
output:
# A tibble: 6 x 4
index height date value
* <int> <int> <int> <int>
1 1 1 2014 23
2 1 2 2015 22
3 2 1 2014 33
4 2 2 2016 12
5 3 1 2014 34
6 3 2 2016 99
You can use paste0(index, "_val") to get you exact output. But I'd prefer to keep them as integers, so you can do math on them in necessary (eg, max()).
edit 1: incorporate the advice & corrections of #jarko-dubbeldam and #hnskd.
edit 2: use rownames_to_column() in case the input isn't a balanced rectangle (eg, one column doesn't all all the rows).

R Cleaning and reordering names/serial numbers in data frame

Let's say I have a data frame as follows in R:
Data <- data.frame("SerialNum" = character(), "Year" = integer(), "Name" = character(), stringsAsFactors = F)
Data[1,] <- c("983\n837\n424\n ", 2015, "Michael\nLewis\nPaul\n ")
Data[2,] <- c("123\n456\n789\n136", 2014, "Elaine\nJerry\nGeorge\nKramer")
Data[3,] <- c("987\n654\n321\n975\n ", 2010, "John\nPaul\nGeorge\nRingo\nNA")
Data[4,] <- c("424\n983\n837", 2015, "Paul\nMichael\nLewis")
Data[5,] <- c("456\n789\n123\n136", 2014, "Jerry\nGeorge\nElaine\nKramer")
What I want to do is the following:
Split up each string of names and each string of serial numbers so that they are their own vectors (or a list of string vectors).
Eliminate any character "NA" in either set of vectors or any blank spaces denoted by "...\n ".
Reorder each list of names alphabetically and reorder the corresponding serial numbers according to the same permutation.
Concatenate each vector in the same fashion it was originally (I usually do this with paste(., collapse = "\n")).
My issue is how to do this without using a for loop. What is an object-oriented way to do this? As a first attempt in this direction I originally made a list by the command LIST <- strsplit(Data$Name, split = "\n") and from here I need a for loop in order to find the permutations of the names, which seems like a process that won't scale according to my actual data. Additionally, once I make the list LIST I'm not sure how I go about removing NA symbols or blank spaces. Any help is appreciated!
Using lapply I take each row of the data frame and turn it into a new data frame with one name per row. This creates a list of 5 data frames, one for each row of the original data frame.
seinfeld = lapply(1:nrow(Data), function(i) {
# Turn strings into data frame with one name per row
dat = data.frame(SerialNum=unlist(strsplit(Data[i,"SerialNum"], split="\n")),
Year=Data[i,"Year"],
Name=unlist(strsplit(Data[i,"Name"], split="\n")))
# Get rid of empty strings and NA values
dat = dat[!(dat$Name %in% c(""," ","NA")), ]
# Order alphabetically
dat = dat[order(dat$Name), ]
})
UPDATE: Based on your comment, let me know if this is the result you're trying to achieve:
seinfeld = lapply(1:nrow(Data), function(i) {
# Turn strings into data frame with one name per row
dat = data.frame(SerialNum=unlist(strsplit(Data[i,"SerialNum"], split="\n")),
Name=unlist(strsplit(Data[i,"Name"], split="\n")))
# Get rid of empty strings and NA values
dat = dat[!(dat$Name %in% c(""," ","NA")), ]
# Order alphabetically
dat = dat[order(dat$Name), ]
# Collapse back into a single row with the new sort order
dat = data.frame(SerialNum=paste(dat[, "SerialNum"], collapse="\n"),
Year=Data[i, "Year"],
Name=paste(dat[, "Name"], collapse="\n"))
})
do.call(rbind, seinfeld)
SerialNum Year Name
1 837\n983\n424 2015 Lewis\nMichael\nPaul
2 123\n789\n456\n136 2014 Elaine\nGeorge\nJerry\nKramer
3 321\n987\n654\n975 2010 George\nJohn\nPaul\nRingo
4 837\n983\n424 2015 Lewis\nMichael\nPaul
5 123\n789\n456\n136 2014 Elaine\nGeorge\nJerry\nKramer
eipi10 offered a great answer. In addition to that, I'd like to leave what I tried mainly with data.table. First, I split two columns (i.e., SerialNum and Name) with cSplit(), added an index with add_rownames(), and split the data by the index. In the first lapply(), I used Stacked() from the splitstackshape package. I stacked SerialNum and Name; separated SeriaNum and Name become two columns, as you see in a part of temp2. In the second lapply(), I used merge from the data.table package. Then, I removed rows with NAs (lapply(na.omit)), combined all data tables (rbindlist), and changed order of rows by rowname, which is row number of the original data) and Name (setorder(rowname, Name))
library(data.table)
library(splitstackshape)
library(dplyr)
cSplit(mydf, c("SerialNum", "Name"), direction = "wide",
type.convert = FALSE, sep = "\n") %>%
add_rownames %>%
split(f = .$rowname) -> temp
#a part of temp
#$`1`
#Source: local data frame [1 x 12]
#
#rowname Year SerialNum_1 SerialNum_2 SerialNum_3 SerialNum_4 SerialNum_5 Name_1 Name_2
#(chr) (dbl) (chr) (chr) (chr) (chr) (chr) (chr) (chr)
#1 1 2015 983 837 424 NA NA Michael Lewis
#Variables not shown: Name_3 (chr), Name_4 (chr), Name_5 (chr)
lapply(temp, function(x){
Stacked(x, var.stubs = c("SerialNum", "Name"), sep = "_")
}) -> temp2
# A part of temp2
#$`1`
#$`1`$SerialNum
# rowname Year .time_1 SerialNum
#1: 1 2015 1 983
#2: 1 2015 2 837
#3: 1 2015 3 424
#4: 1 2015 4 NA
#5: 1 2015 5 NA
#
#$`1`$Name
# rowname Year .time_1 Name
#1: 1 2015 1 Michael
#2: 1 2015 2 Lewis
#3: 1 2015 3 Paul
#4: 1 2015 4 NA
#5: 1 2015 5 NA
lapply(1:nrow(mydf), function(x){
merge(temp2[[x]]$SerialNum, temp2[[x]]$Name, by = c("rowname", "Year", ".time_1"))
}) %>%
lapply(na.omit) %>%
rbindlist %>%
setorder(rowname, Name) -> out
print(out)
# rowname Year .time_1 SerialNum Name
# 1: 1 2015 2 837 Lewis
# 2: 1 2015 1 983 Michael
# 3: 1 2015 3 424 Paul
# 4: 2 2014 1 123 Elaine
# 5: 2 2014 3 789 George
# 6: 2 2014 2 456 Jerry
# 7: 2 2014 4 136 Kramer
# 8: 3 2010 3 321 George
# 9: 3 2010 1 987 John
#10: 3 2010 2 654 Paul
#11: 3 2010 4 975 Ringo
#12: 4 2015 3 837 Lewis
#13: 4 2015 2 983 Michael
#14: 4 2015 1 424 Paul
#15: 5 2014 3 123 Elaine
#16: 5 2014 2 789 George
#17: 5 2014 1 456 Jerry
#18: 5 2014 4 136 Kramer
DATA
mydf <- structure(list(SerialNum = c("983\n837\n424\n ", "123\n456\n789\n136",
"987\n654\n321\n975\n ", "424\n983\n837", "456\n789\n123\n136"
), Year = c(2015, 2014, 2010, 2015, 2014), Name = c("Michael\nLewis\nPaul\n ",
"Elaine\nJerry\nGeorge\nKramer", "John\nPaul\nGeorge\nRingo\nNA",
"Paul\nMichael\nLewis", "Jerry\nGeorge\nElaine\nKramer")), .Names = c("SerialNum",
"Year", "Name"), row.names = c(NA, -5L), class = "data.frame")

Clean way to calculate both group and overall statistics

I would like like to calculate the median not only for different groups of my data, but also the median over all groups and store the result in a single data.frame. While accomplishing each of these tasks separately is easy, I have not found a clean way to do both at the same time.
Right now, what I'm doing is calculate both statistics separately; then join the results; then tidy the data if necessary. Here's an example of what this may look like if I wanted to know the median delay per day and per month:
library(dplyr)
library(hflights)
data(hflights)
# Calculate both statistics separately
per_day <- hflights %>%
group_by(Year, Month, DayofMonth) %>%
summarise(Delay = mean(ArrDelay, na.rm = TRUE)) %>%
mutate(Interval = "Daily")
per_month <- hflights %>%
group_by(Year, Month) %>%
summarise(Delay = mean(ArrDelay, na.rm = TRUE)) %>%
mutate(Interval = "Monthly", DayofMonth = NA)
# Join into a single data.frame
my_summary <- full_join(per_day, per_month,
by = c("Year", "Month", "DayofMonth", "Interval", "Delay"))
my_summary
# Source: local data frame [377 x 5]
# Groups: Year, Month
#
# Year Month DayofMonth Delay Interval
# 1 2011 1 1 10.067642 Daily
# 2 2011 1 2 10.509745 Daily
# 3 2011 1 3 6.038627 Daily
# 4 2011 1 4 7.970740 Daily
# 5 2011 1 5 4.172650 Daily
# 6 2011 1 6 6.069909 Daily
# 7 2011 1 7 3.907295 Daily
# 8 2011 1 8 3.070140 Daily
# 9 2011 1 9 17.254325 Daily
# 10 2011 1 10 11.040388 Daily
# .. ... ... ... ... ...
Are there better ways to do this?
(Note that in many cases one could easily progressively roll up summaries as pointed out in the Introduction to dplyr. However, this doesn't work for statistics like median, mean etc.)
As a one-off table. This is fairly straightforward in data.table:
require(data.table)
setDT(hflights)[,{
mo_del <- mean(ArrDelay,na.rm=TRUE)
.SD[,.(DailyDelay = mean(ArrDelay,na.rm=TRUE),MonthlyDelay = mo_del),by=DayofMonth]
},by=.(Year,Month)]
# Year Month DayofMonth DailyDelay MonthlyDelay
# 1: 2011 1 1 10.0676417 4.926065
# 2: 2011 1 2 10.5097451 4.926065
# 3: 2011 1 3 6.0386266 4.926065
# 4: 2011 1 4 7.9707401 4.926065
# 5: 2011 1 5 4.1726496 4.926065
# ---
# 361: 2011 12 14 1.0293610 5.013244
# 362: 2011 12 17 -0.1049822 5.013244
# 363: 2011 12 24 -4.1457490 5.013244
# 364: 2011 12 25 -2.2976827 5.013244
# 365: 2011 12 31 46.4846491 5.013244
How it works. The basic syntax is DT[i,j,by].
With by=.(Year,Month), all operations in j are done per "by group."
We can nest another "by group" using the data.table of the current Subset of Data, .SD.
To return columns in j we use .(colname1=col1,colname2=col2,...).
Creating new variables. Alternately, we could create new variables in hflights using := in j.
hflights[,DailyDelay := mean(ArrDelay,na.rm=TRUE),.(Year,Month,DayofMonth)]
hflights[,MonthlyDelay := mean(ArrDelay,na.rm=TRUE),.(Year,Month)]
Then we can view the summary table:
hflights[,.GRP,.(Year,Month,DayofMonth,DailyDelay,MonthlyDelay)]
# Year Month DayofMonth DailyDelay MonthlyDelay .GRP
# 1: 2011 1 1 10.0676417 4.926065 1
# 2: 2011 1 2 10.5097451 4.926065 2
# 3: 2011 1 3 6.0386266 4.926065 3
# 4: 2011 1 4 7.9707401 4.926065 4
# 5: 2011 1 5 4.1726496 4.926065 5
# ---
# 361: 2011 12 14 1.0293610 5.013244 361
# 362: 2011 12 17 -0.1049822 5.013244 362
# 363: 2011 12 24 -4.1457490 5.013244 363
# 364: 2011 12 25 -2.2976827 5.013244 364
# 365: 2011 12 31 46.4846491 5.013244 365
(Something needed to be put in j here, so I used the "by group" code, .GRP.)

Resources