Updating categories with too few observations - r

Please note that this question has been edited after r2evans' answer.
Example data
I have example data as follows:
library(data.table)
# Column-name helpers: the variables, their per-stratum observation counts,
# and the per-variable adapted stratum columns.
vars_of_interest <- c("A", "B", "C")
vars_of_interest_obs_tot <- paste0(vars_of_interest, "_tot")
adapted_BMstratum <- paste0(vars_of_interest, "_adapted_BMstratum")
full_df_bm <- fread("A B C BMstratum
1 NA NA 1110
23 1 2 1120
1 NA 1 1130
6 NA NA 1140
NA 1 1 1100
2 2 4 1110
NA 1 2 1120
NA 21 11 1130")
# Per BMstratum, count the non-missing values of every variable of interest
full_df_bm[, (vars_of_interest_obs_tot) := lapply(.SD, function(v) sum(!is.na(v))),
  by = "BMstratum", .SDcols = vars_of_interest]
print(full_df_bm)
# A B C BMstratum A_tot B_tot C_tot
# 1: 1 NA NA 1110 2 1 1
# 2: 23 1 2 1120 1 2 2
# 3: 1 NA 1 1130 1 1 2
# 4: 6 NA NA 1140 1 0 0
# 5: NA 1 1 1100 0 1 1
# 6: 2 2 4 1110 2 1 1
# 7: NA 1 2 1120 1 2 2
# 8: NA 21 11 1130 1 1 2
# The adapted strata start the same as the original
setDT(full_df_bm)[, (adapted_BMstratum):=BMstratum]
print(full_df_bm)
# A B C BMstratum A_tot B_tot C_tot A_adapted_BMstratum B_adapted_BMstratum C_adapted_BMstratum
# 1: 1 NA NA 1110 2 1 1 1110 1110 1110
# 2: 23 1 2 1120 1 2 2 1120 1120 1120
# 3: 1 NA 1 1130 1 1 2 1130 1130 1130
# 4: 6 NA NA 1140 1 0 0 1140 1140 1140
# 5: NA 1 1 1100 0 1 1 1100 1100 1100
# 6: 2 2 4 1110 2 1 1 1110 1110 1110
# 7: NA 1 2 1120 1 2 2 1120 1120 1120
# 8: NA 21 11 1130 1 1 2 1130 1130 1130
Updating the strata
For every variable in adapted_BMstratum, I would like to manually decide what to do when there are less than 2 observations for each of the variables A, B, or C.
# Walk over the three adapted-stratum columns (one per variable A, B, C) and
# apply the manual merge rules in sequence. After every rule all three *_tot
# count columns are recomputed, grouped by the i-th adapted column only.
for (i in seq_along(adapted_BMstratum)) {
# If stratum 1110 has less than two observations change to 1120
setDT(full_df_bm)[get(vars_of_interest_obs_tot[i])<2 & get(adapted_BMstratum[i])==1110, (adapted_BMstratum[i]):=1120 ,]
# Update the observations
bygroup <- adapted_BMstratum[i]
setDT(full_df_bm)[, (vars_of_interest_obs_tot) := lapply(vars_of_interest, function(x) sum(!is.na(get(x)))),by = bygroup]
# If stratum 1120 has less than two observations change to 1110
setDT(full_df_bm)[get(vars_of_interest_obs_tot[i])<2 & get(adapted_BMstratum[i])==1120, (adapted_BMstratum[i]):=1110,]
# Update the observations
bygroup <- adapted_BMstratum[i]
setDT(full_df_bm)[, (vars_of_interest_obs_tot) := lapply(vars_of_interest, function(x) sum(!is.na(get(x)))),by = bygroup]
# If stratum 1130 has less than two observations change to 1110
# NOTE(review): the condition below tests 1120 again although the comment
# above names stratum 1130 - confirm which stratum is intended.
setDT(full_df_bm)[get(vars_of_interest_obs_tot[i])<2 & get(adapted_BMstratum[i])==1120, (adapted_BMstratum[i]):=1110,]
# Update the observations
bygroup <- adapted_BMstratum[i]
setDT(full_df_bm)[, (vars_of_interest_obs_tot) := lapply(vars_of_interest, function(x) sum(!is.na(get(x)))),by = bygroup]
# If any strata after has less than 2 observations, change them all to 1110
# NOTE(review): `||` is the scalar OR. Applied to these column vectors it
# raises the "'length(x) = 8 > 1' in coercion to 'logical(1)'" warning quoted
# below (and is an error from R 4.3 on); the elementwise operator is `|`.
setDT(full_df_bm)[get(vars_of_interest_obs_tot[i])<2 & (get(adapted_BMstratum[i])==1110 || get(adapted_BMstratum[i])==1120 || get(adapted_BMstratum[i])==1130), (adapted_BMstratum[i]):=1110,]
# Update the observations a last time
bygroup <- adapted_BMstratum[i]
setDT(full_df_bm)[, (vars_of_interest_obs_tot) := lapply(vars_of_interest, function(x) sum(!is.na(get(x)))),by = bygroup]
}
This does however not give the desired outcome:
A B C BMstratum A_tot B_tot C_tot A_adapted_BMstratum B_adapted_BMstratum C_adapted_BMstratum
1: 1 NA NA 1110 2 1 1 1110 1110 1110
2: 23 1 2 1120 1 2 2 1110 1120 1120
3: 1 NA 1 1130 1 1 2 1110 1110 1130
4: 6 NA NA 1140 1 0 0 1110 1110 1110
5: NA 1 1 1100 0 1 1 1110 1110 1110
6: 2 2 4 1110 2 1 1 1110 1110 1110
7: NA 1 2 1120 1 2 2 1110 1120 1120
8: NA 21 11 1130 1 1 2 1110 1110 1130
In addition it gives the following warnings:
Warning messages:
1: In get(adapted_BMstratum[i]) == 1110 || get(adapted_BMstratum[i]) == :
'length(x) = 8 > 1' in coercion to 'logical(1)'
2: In get(adapted_BMstratum[i]) == 1110 || get(adapted_BMstratum[i]) == :
'length(x) = 8 > 1' in coercion to 'logical(1)'
3: In get(adapted_BMstratum[i]) == 1110 || get(adapted_BMstratum[i]) == :
'length(x) = 8 > 1' in coercion to 'logical(1)'
Desired outcome
NOTE: For B_adapted_stratum all have been changed to 1110 because 1110,1120 and 1130, (if they exist) did not all have at least 2 observations.
# A B C BMstratum A_tot B_tot C_tot A_adapted_BMstratum B_adapted_BMstratum C_adapted_BMstratum
# 1: 1 NA NA 1110 4 6 4 1110 1120 1120
# 2: 23 1 2 1120 4 6 4 1110 1120 1120
# 3: 1 NA 1 1130 4 6 2 1110 1120 1130
# 4: 6 NA NA 1140 1 0 0 1140 1140 1140
# 5: NA 1 1 1100 0 1 1 1100 1100 1100
# 6: 2 2 4 1110 4 6 4 1110 1120 1120
# 7: NA 1 2 1120 4 6 4 1110 1120 1120
# 8: NA 21 11 1130 4 6 2 1110 1120 1130
Note: The strata 1100 and 1140 should not be touched, but should not be removed either. This has to do with the fact that I need to add manual rules for these numbers separately. In the real data, there are many more numbers and rules, and I think it would become too messy to write everything out.

Here's a start, though I don't know how to assign 1120 to A_adapted_BMstratum since the two categories are identical:
# For each *_tot column: rows with fewer than 2 observations take the
# BMstratum of the row holding the smallest count (the first one on ties);
# all other rows keep their own BMstratum.
full_df_bm[, (adapted_BMstratum) := lapply(.SD, function(n_obs) {
  fifelse(n_obs < 2, BMstratum[which.min(n_obs)], BMstratum)
}), .SDcols = vars_of_interest_obs_tot]
# A_tot B_tot C_tot BMstratum A_adapted_BMstratum B_adapted_BMstratum C_adapted_BMstratum
# <int> <int> <int> <int> <int> <int> <int>
# 1: 1 2 1 1110 1110 1110 1110
# 2: 1 1 2 1120 1110 1120 1120

Related

R fill new column based on interval from another dataset (lookup)

Let's say I have this dataset:
# Example table: three groups (a, b, c) of six test ids each.
df1 <- data.frame(
  groupID = rep(c("a", "b", "c"), each = 6),
  testid = c(
    111, 222, 333, 444, 555, 666,
    777, 888, 999, 1010, 1111, 1212,
    1313, 1414, 1515, 1616, 1717, 1818
  )
)
df1
groupID testid
1 a 111
2 a 222
3 a 333
4 a 444
5 a 555
6 a 666
7 b 777
8 b 888
9 b 999
10 b 1010
11 b 1111
12 b 1212
13 c 1313
14 c 1414
15 c 1515
16 c 1616
17 c 1717
18 c 1818
And I have this 2nd dataset:
# Lookup table: the test ids per group that carry a bd value.
df2 <- data.frame(
  groupID = rep(c("a", "b", "c"), times = c(4, 3, 3)),
  testid = c(
    222, 333, 555, 666,
    777, 999, 1010,
    1313, 1616, 1818
  ),
  bd = c(
    1, 1, 2, 2,
    0, 1, 1,
    1, 1, 2
  )
)
df2
groupID testid bd
1 a 222 1
2 a 333 1
3 a 555 2
4 a 666 2
5 b 777 0
6 b 999 1
7 b 1010 1
8 c 1313 1
9 c 1616 1
10 c 1818 2
I want to use the intervals in the 2nd dataset to fill in a new variable in the 1st dataset: within each group, the rows lying between two occurrences of the same bd value should be filled with that value, and every other row should be NA.
Desired output:
groupID testid new_bd
1 a 111 NA
2 a 222 1
3 a 333 1
4 a 444 NA
5 a 555 2
6 a 666 2
7 b 777 0
8 b 888 NA
9 b 999 1
10 b 1010 1
11 b 1111 NA
12 b 1212 NA
13 c 1313 1
14 c 1414 1
15 c 1515 1
16 c 1616 1
17 c 1717 NA
18 c 1818 2
Ideally would like dplyr/tidyr solution but open to any approaches.
similar but these fill all values:
R: Filling timeseries values but only within last 12 months
R autofill blanks in variable until next value
I would start by modifying df2 to start and end of range. And you can loop or do anything else after.
# Collapse df2 to one row per (groupID, bd) pair holding the smallest and
# largest testid seen for that pair, i.e. the interval to fill.
grps <- df2 %>%
  group_by(groupID, bd) %>%
  summarize(start = min(testid), end = max(testid))
grps
groupID bd start end
<fct> <dbl> <dbl> <dbl>
1 a 1 222 333
2 a 2 555 666
3 b 0 777 777
4 b 1 999 1010
5 c 1 1313 1616
6 c 2 1818 1818
# Fill df1$bd from the [start, end] intervals in grps, one interval per pass.
# Rows that fall in no interval keep NA.
df1$bd <- NA
for (i in seq_len(nrow(grps))) {
  # Spell out `testid` (the original `df1$test` only worked through `$`
  # partial matching) and also require the same groupID, so an interval of one
  # group can never label rows of another group. Compare as character so
  # factor columns with different level sets do not raise an error.
  in_interval <- which(
    as.character(df1$groupID) == as.character(grps$groupID[i]) &
      df1$testid >= grps$start[i] &
      df1$testid <= grps$end[i]
  )
  df1$bd[in_interval] <- grps$bd[i]
}
df1
groupID testid bd
1 a 111 NA
2 a 222 1
3 a 333 1
4 a 444 NA
5 a 555 2
6 a 666 2
7 b 777 0
8 b 888 NA
9 b 999 1
10 b 1010 1
11 b 1111 NA
12 b 1212 NA
13 c 1313 1
14 c 1414 1
15 c 1515 1
16 c 1616 1
17 c 1717 NA
18 c 1818 2
Maybe I have overlooked a simpler method, but here is what I came up with using dplyr. We first left_join df1 with df2 and fill the bd column downwards. We then group by groupID and bd, find the first and last row in each group that had a non-NA value before filling, and set the filled values back to NA for rows that lie before the first or after the last of those rows.
library(dplyr)
# Attach bd from df2 to the df1 rows with a matching (groupID, testid);
# rows without a match get NA.
left_join(df1, df2, by = c("groupID", "testid")) %>%
# Keep an unfilled copy so the originally matched rows stay identifiable.
mutate(bd1 = bd) %>%
# Carry the last seen bd downwards; this runs before group_by(), i.e. over the
# whole table rather than per group.
tidyr::fill(bd) %>%
group_by(groupID, bd) %>%
# Within each (groupID, filled bd) group, minRow/maxRow are the positions of
# the first and last row that had an original value; a group whose bd is all
# NA spans the whole group.
mutate(minRow = if (all(is.na(bd))) 1 else first(which(!is.na(bd1))),
maxRow = if (all(is.na(bd))) n() else last(which(!is.na(bd1))),
# Keep the filled value only between minRow and maxRow; filled rows outside
# that span are reset to NA.
new_bd = replace(bd, is.na(bd1) & (row_number() < minRow |
row_number() > maxRow), NA)) %>%
ungroup() %>%
# Return df1's own columns plus the new column.
select(names(df1), new_bd)
# groupID testid new_bd
# <fct> <dbl> <dbl>
# 1 a 111 NA
# 2 a 222 1
# 3 a 333 1
# 4 a 444 NA
# 5 a 555 2
# 6 a 666 2
# 7 b 777 0
# 8 b 888 NA
# 9 b 999 1
#10 b 1010 1
#11 b 1111 NA
#12 b 1212 NA
#13 c 1313 1
#14 c 1414 1
#15 c 1515 1
#16 c 1616 1
#17 c 1717 NA
#18 c 1818 2
Here is a solution that works on my test data example above but won't run on my large dataset, where I run into the error "Error: cannot allocate vector of size 45.5 Gb". I believe it is related to the problem outlined here: "The same size explosion can happen if you have lots of the same level in both with otherwise different rows". In my actual dataset I'm looking at date variables; I didn't think this would affect the problem, but maybe it does. I'm not sure whether there is a workaround using fuzzyjoin, as it does work on a subset of the data.
library(tidyverse)
library(fuzzyjoin)
library(tidylog)
grps <- df2 %>% group_by(groupID, bd) %>% summarize(start = min(testid), end = max(testid))
grps
# Interval join: keep every row of df1 and attach the grps row whose groupID
# is equal and whose [start, end] range contains testid. The functions in
# match_fun pair up positionally with the entries of `by`.
df1 %>%
fuzzy_left_join(grps,
by = c("groupID" = "groupID",
"testid" = "start",
"testid" = "end"),
match_fun = list(`==`, `>=`, `<=`)) %>%
# Both inputs have a groupID column, so the join suffixes them with .x/.y;
# keep the df1 side under its original name.
select(groupID = groupID.x, testid, bd, start, end)
select: dropped 2 variables (groupID.x, groupID.y)
groupID testid bd start end
1 a 111 NA NA NA
2 a 222 1 222 333
3 a 333 1 222 333
4 a 444 NA NA NA
5 a 555 2 555 666
6 a 666 2 555 666
7 b 777 0 777 777
8 b 888 NA NA NA
9 b 999 1 999 1010
10 b 1010 1 999 1010
11 b 1111 NA NA NA
12 b 1212 NA NA NA
13 c 1313 1 1313 1616
14 c 1414 1 1313 1616
15 c 1515 1 1313 1616
16 c 1616 1 1313 1616
17 c 1717 NA NA NA
18 c 1818 2 1818 1818
data.table solution:
library(data.table)
# Non-equi join: for every row of df1, look up the grps interval of the same
# groupID with start <= testid <= end; rows without a match get NA.
# The x. prefix selects the columns of grps (the table before the bracket).
# Console prompts ("> " and "+ ") are omitted so the snippet can be pasted
# or sourced as-is.
new <- setDT(grps)[setDT(df1),
  .(groupID, testid, x.start, x.end, x.bd),
  on = .(groupID, start <= testid, end >= testid)]
new
groupID testid x.start x.end x.bd
1: a 111 NA NA NA
2: a 222 222 333 1
3: a 333 222 333 1
4: a 444 NA NA NA
5: a 555 555 666 2
6: a 666 555 666 2
7: b 777 777 777 0
8: b 888 NA NA NA
9: b 999 999 1010 1
10: b 1010 999 1010 1
11: b 1111 NA NA NA
12: b 1212 NA NA NA
13: c 1313 1313 1616 1
14: c 1414 1313 1616 1
15: c 1515 1313 1616 1
16: c 1616 1313 1616 1
17: c 1717 NA NA NA
18: c 1818 1818 1818 2
I think it may be done in fuzzyjoin using internal_join but I'm not sure?: https://github.com/dgrtwo/fuzzyjoin/issues/50

R: Keep row from duplicates (several columns) based on condition

Basically I want to:
If rows are duplicated on the combination of some specific columns, then keep only the row that has the lowest value on another column.
Example data (there's a lot more variance in my real data):
ID BilagNr Henstand Aftale Belob RP Pos Dps Udlign rykkedage
1 111 01-01-2017 1111 100 YA 1 1 10
1 122 02-01-2017 1222 100 YA 1 1 40
1 111 01-07-2017 1111 100 YA 1 1 100
2 222 01-01-2017 2121 299 YA 1 4 5
2 222 01-01-2017 2121 299 YA 1 4 98
2 212 01-05-2017 7654 299 BS 1 3
3 333 01-08-2017 7654 345 BS 2 45
4 444 01-01-2017 7654 345 BS 3 1 4 68
4 411 09-01-2017 7654 345 BS 1 4 43
5 555 01-01-2017 5555 700 BS 1 13
5 555 01-01-2017 5555 700 BS 1 67
6 666 01-01-2017 4720 100 BS 1 23
6 666 03-01-2017 1234 100 BS 2 1 23
6 666 07-08-2017 1234 120 BS 3 1 1 23
7 777 01-01-2017 1234 90 BS 1 1 23
7 777 01-01-2017 1234 90 BS 1 1 199
So I want to only keep these:
ID BilagNr Henstand Aftale Belob RP Pos Dps Udlign rykkedage
1 111 01-01-2017 1111 100 YA 1 1 10
1 122 02-01-2017 1222 100 YA 1 1 40
2 222 01-01-2017 2121 299 YA 1 4 5
2 212 01-05-2017 7654 299 BS 1 3
3 333 01-08-2017 7654 345 BS 2 45
4 444 01-01-2017 7654 345 BS 3 1 4 68
4 411 09-01-2017 7654 345 BS 1 4 43
5 555 01-01-2017 5555 700 BS 1 13
6 666 01-01-2017 4720 100 BS 1 23
6 666 03-01-2017 1234 100 BS 2 1 23
6 666 07-08-2017 1234 120 BS 3 1 1 23
7 777 01-01-2017 1234 90 BS 1 1 23
In other words:
If the rows are duplicated in a combination of the columns ID, BilagNr, Henstand, Aftale, Belob, RP, Pos, Dps, Udlign then keep only one of the duplicated rows and choose this from the condition that rykkedage has to be the smallest of the duplicated rows.
I hope it makes sense.
Furthermore, is it possible to add code that keeps those duplicated rows that have the same value in rykkedage? I have a large dataset, and I'm not sure if this is even a problem.
Thank you!
We can group by 'ID', 'BilagNr', ..., 'Udlign', and then slice the rows with the index of the minimum value in 'rykkedage'
library(dplyr)
# One row per combination of the key columns: the row with the smallest
# rykkedage. which.min() returns a single index, so when several rows tie on
# the minimum only the first of them is kept. The result stays grouped.
df1 %>%
group_by(ID, BilagNr, Henstand, Aftale, Belob, RP, Pos, Dps, Udlign) %>%
slice(which.min(rykkedage))
# A tibble: 13 x 10
# Groups: ID, BilagNr, Henstand, Aftale, Belob, RP, Pos, Dps, Udlign [13]
# ID BilagNr Henstand Aftale Belob RP Pos Dps Udlign rykkedage
# <int> <int> <chr> <int> <int> <chr> <int> <int> <int> <int>
# 1 1 111 01-01-2017 1111 100 YA 1 1 NA 10
# 2 1 111 01-07-2017 1111 100 YA 1 1 NA 100
# 3 1 122 02-01-2017 1222 100 YA 1 NA 1 40
# 4 2 212 01-05-2017 7654 299 BS 1 NA NA 3
# 5 2 222 01-01-2017 2121 299 YA 1 NA 4 5
# 6 3 333 01-08-2017 7654 345 BS 2 NA NA 45
# 7 4 411 09-01-2017 7654 345 BS 1 NA 4 43
# 8 4 444 01-01-2017 7654 345 BS 3 1 4 68
# 9 5 555 01-01-2017 5555 700 BS 1 NA NA 13
#10 6 666 01-01-2017 4720 100 BS 1 NA NA 23
#11 6 666 03-01-2017 1234 100 BS 2 NA 1 23
#12 6 666 07-08-2017 1234 120 BS 3 1 1 23
#13 7 777 01-01-2017 1234 90 BS 1 NA 1 23

Creating a rank column based on two other (linked) columns in R

I have the following dataframe (example data) which has the dates of different DVD recordings for different pairs of birds for numerous broods:
PairID BroodRef DVDdate
1 512 2004-05-22
1 512 2004-05-30
1 512 2004-05-26
1 588 2004-06-30
1 588 2004-07-04
1 588 2004-07-09
2 673 2004-07-19
3 543 2004-06-03
3 543 2004-06-07
3 543 2004-06-11
3 620 2004-07-19
3 39 2005-05-19
3 39 2005-05-23
What I'd like is a brood number for each pair, such as:
PairID BroodRef DVDdate BroodNumber
1 512 2004-05-22 1
1 512 2004-05-30 1
1 512 2004-05-26 1
1 588 2004-06-30 2
1 588 2004-07-04 2
1 588 2004-07-09 2
2 673 2004-07-19 1
3 543 2004-06-03 1
3 543 2004-06-07 1
3 543 2004-06-11 1
3 620 2004-07-19 2
3 39 2005-05-19 3
3 39 2005-05-23 3
I have tried
ddply(df,.(PairID),transform,BroodNumber = dense_rank(BroodRef))
which I saw on another question, but this results in Pair 3, BroodRef 39 being BroodNumber 1 rather than the 3 it should be.
Appreciate any help!
We could use rleid() from data.table to create a sequence based on BroodRef, grouped by PairID.
library(data.table)
# Within each PairID, rleid() starts a new id whenever BroodRef differs from
# the previous row, so broods are numbered in order of appearance.
# NOTE(review): this relies on all rows of one brood being adjacent within a
# pair; a BroodRef that reappears later would get a new number.
setDT(df)[,BroodNumber := rleid(BroodRef), by = PairID]
# PairID BroodRef DVDdate BroodNumber
# 1: 1 512 2004-05-22 1
# 2: 1 512 2004-05-30 1
# 3: 1 512 2004-05-26 1
# 4: 1 588 2004-06-30 2
# 5: 1 588 2004-07-04 2
# 6: 1 588 2004-07-09 2
# 7: 2 673 2004-07-19 1
# 8: 3 543 2004-06-03 1
# 9: 3 543 2004-06-07 1
#10: 3 543 2004-06-11 1
#11: 3 620 2004-07-19 2
#12: 3 39 2005-05-19 3
#13: 3 39 2005-05-23 3
We can use dplyr
library(dplyr)
# Within each PairID, number every BroodRef by the position of its first
# appearance: unique() lists the refs in order of appearance and match()
# returns each row's position in that list.
df1 %>%
group_by(PairID) %>%
mutate(BroodNumber = match(BroodRef, unique(BroodRef)))
# PairID BroodRef DVDdate BroodNumber
# (int) (int) (chr) (int)
#1 1 512 2004-05-22 1
#2 1 512 2004-05-30 1
#3 1 512 2004-05-26 1
#4 1 588 2004-06-30 2
#5 1 588 2004-07-04 2
#6 1 588 2004-07-09 2
#7 2 673 2004-07-19 1
#8 3 543 2004-06-03 1
#9 3 543 2004-06-07 1
#10 3 543 2004-06-11 1
#11 3 620 2004-07-19 2
#12 3 39 2005-05-19 3
#13 3 39 2005-05-23 3

Merging data frames with different number of rows and different columns

I have two data frames with different number of columns and rows. I want to combine them into one data frame.
> month.saf
Name NCDC Year Month Day HrMn Temp Q
244 AP 99999 2014 2 1 0 12 1
245 AP 99999 2014 2 1 300 12.2 1
246 AP 99999 2014 2 1 600 14.4 1
247 AP 99999 2014 2 1 900 18.6 1
248 AP 99999 2014 2 1 1200 18 1
249 AP 99999 2014 2 1 1500 13.6 1
250 AP 99999 2014 2 1 1800 11.8 1
251 AP 99999 2014 2 1 2100 10.8 1
252 AP 99999 2014 2 2 0 8.4 1
253 AP 99999 2014 2 2 300 8.6 1
254 AP 99999 2014 2 2 600 19.8 2
255 AP 99999 2014 2 2 900 22.8 1
256 AP 99999 2014 2 2 1200 20.8 1
257 AP 99999 2014 2 2 1500 16.4 1
258 AP 99999 2014 2 2 1800 13.4 1
259 AP 99999 2014 2 2 2100 12.4 1
> T2Mdf
V1 V2
0 293.494262695312 291.642639160156
300 294.003479003906 292.375091552734
600 296.809997558594 295.207885742188
900 298.287811279297 297.181549072266
1200 298.317565917969 297.725708007813
1500 298.134002685547 296.226165771484
1800 296.006805419922 293.354248046875
2100 293.785491943359 293.547210693359
0.1 294.638732910156 293.019866943359
300.1 292.179992675781 291.256958007812
The output that I want is like this:
Name NCDC Year Month Day HrMn Temp Q V1 V2
244 AP 99999 2014 2 1 0 12 1 293.4942627 291.6426392
245 AP 99999 2014 2 1 300 12.2 1 294.003479 292.3750916
246 AP 99999 2014 2 1 600 14.4 1 296.8099976 295.2078857
247 AP 99999 2014 2 1 900 18.6 1 298.2878113 297.1815491
248 AP 99999 2014 2 1 1200 18 1 298.3175659 297.725708
249 AP 99999 2014 2 1 1500 13.6 1 298.1340027 296.2261658
250 AP 99999 2014 2 1 1800 11.8 1 296.0068054 293.354248
251 AP 99999 2014 2 1 2100 10.8 1 293.7854919 293.5472107
252 AP 99999 2014 2 2 0 8.4 1 294.6387329 293.0198669
253 AP 99999 2014 2 2 300 8.6 1 292.1799927 291.256958
254 AP 99999 2014 2 2 600 19.8 2 292.2477417 291.3471069
255 AP 99999 2014 2 2 900 22.8 1 294.2276306 294.2766418
256 AP 99999 2014 2 2 1200 20.8 1 NA NA
257 AP 99999 2014 2 2 1500 16.4 1 NA NA
258 AP 99999 2014 2 2 1800 13.4 1 NA NA
259 AP 99999 2014 2 2 2100 12.4 1 NA NA
I tried cbind, but it gives me an error:
Error in data.frame(..., check.names = FALSE) : arguments imply
differing number of rows: 216, 220
And using rbind.fill() but it gives me something like
V1 V2 Name USAF NCDC Year Month Day HrMn I Type QCP Temp Q
1 293.494262695312 291.642639160156 <NA> NA NA NA NA NA NA NA <NA> NA <NA> NA
2 294.003479003906 292.375091552734 <NA> NA NA NA NA NA NA NA <NA> NA <NA> NA
3 296.809997558594 295.207885742188 <NA> NA NA NA NA NA NA NA <NA> NA <NA> NA
4 298.287811279297 297.181549072266 <NA> NA NA NA NA NA NA NA <NA> NA <NA> NA
5 298.317565917969 297.725708007813 <NA> NA NA NA NA NA NA NA <NA> NA <NA> NA
6 <NA> <NA> AP 421820 99999 2014 2 1 0 4 FM-12 NA 12 1
7 <NA> <NA> AP 421820 99999 2014 2 1 300 4 FM-12 NA 12.2 1
8 <NA> <NA> AP 421820 99999 2014 2 1 600 4 FM-12 NA 14.4 1
9 <NA> <NA> AP 421820 99999 2014 2 1 900 4 FM-12 NA 18.6 1
10 <NA> <NA> AP 421820 99999 2014 2 1 1200 4 FM-12 NA 18 1
How is it possible to do this in R?
If A and B are the two input data frames, here are some solutions:
1) merge This solution works regardless of whether A or B has more rows.
# Column-bind A and B by row position, padding the shorter one with NA.
# row.names = NULL resets both inputs to row names 1..n, by = 0 joins on those
# row names, and all = TRUE keeps the unmatched rows of the longer input.
local({
  merged <- merge(data.frame(A, row.names = NULL), data.frame(B, row.names = NULL),
    by = 0, all = TRUE)
  # merge() sorts on Row.names as character ("1", "10", "11", "2", ...), so
  # restore the numeric order before dropping that helper column; without
  # this, inputs with 10 or more rows come back in scrambled order.
  merged <- merged[order(as.integer(merged$Row.names)), -1, drop = FALSE]
  rownames(merged) <- NULL
  merged
})
The first two arguments could be replaced with just A and B respectively if A and B have default rownames, i.e. 1, 2, ..., or if they have consistent rownames. That is, merge(A, B, by = 0, all = TRUE)[-1] .
For example, if we have this input:
# test inputs
A <- data.frame(BOD, row.names = letters[1:6])
B <- setNames(2 * BOD[1:2, ], c("X", "Y"))
then:
# Column-bind A and B by row position, padding the shorter one with NA.
# row.names = NULL resets both inputs to row names 1..n, by = 0 joins on those
# row names, and all = TRUE keeps the unmatched rows of the longer input.
local({
  merged <- merge(data.frame(A, row.names = NULL), data.frame(B, row.names = NULL),
    by = 0, all = TRUE)
  # merge() sorts on Row.names as character ("1", "10", "11", "2", ...), so
  # restore the numeric order before dropping that helper column; without
  # this, inputs with 10 or more rows come back in scrambled order.
  merged <- merged[order(as.integer(merged$Row.names)), -1, drop = FALSE]
  rownames(merged) <- NULL
  merged
})
gives:
Time demand X Y
1 1 8.3 2 16.6
2 2 10.3 4 20.6
3 3 19.0 NA NA
4 4 16.0 NA NA
5 5 15.6 NA NA
6 7 19.8 NA NA
1a) An equivalent variation is:
# Same merge built programmatically: lapply() resets the row names of each
# input, do.call() passes the two data frames plus by/all to merge(), and
# [-1] drops the Row.names column that merging by row names adds.
do.call("merge", c(lapply(list(A, B), data.frame, row.names=NULL),
by = 0, all = TRUE))[-1]
2) cbind.zoo This solution assumes that A has more rows and that B's entries are all of the same type, e.g. all numeric. A is not restricted. These conditions hold in the data of the question.
library(zoo)
# zoo(, 1:nrow(A)) is a data-less series indexed 1..nrow(A); cbind-ing B onto
# it aligns B by position and pads the rows B does not have with NA.
# NOTE(review): as.zoo(B) presumably converts B via a matrix, which would be
# why B's columns must share one type - confirm.
data.frame(A, cbind(zoo(, 1:nrow(A)), as.zoo(B)))

how to insert many rows

I have this data:
hhid perid actNo thisAct from to tripTime
8019450 1 1 home 180 1051 NA
8019450 1 2 school 1075 1245 24
8019450 1 3 socrec 1255 1260 10
8019450 1 4 home 1280 1619 20
Now, I want to insert three rows in which
thisAct=travel
from=(from-tripTime-1)
to=(from-1)
Then, the expected data look like this:
hhid perid actNo thisAct from to tripTime
8019450 1 1 home 180 1051 NA
*8019450 1 2 travel 1052 1074
8019450 1 3 school 1075 1245 24
*8019450 1 4 travel 1246 1254
8019450 1 5 socrec 1255 1260 10
*8019450 1 6 travel 1261 1279
8019450 1 7 home 1280 1619 20
Could you please show me how to insert those rows marked with asterisks?
Thank you.
Start by recreating your data:
# Rebuild the example data from the question as a data frame.
# NOTE(review): the first data row below has hhid "c", whereas the question's
# data shows 8019450 on every row - looks like a typo; confirm.
dat <- read.table(text="
hhid perid actNo thisAct from to tripTime
c 1 1 home 180 1051 NA
8019450 1 2 school 1075 1245 24
8019450 1 3 socrec 1255 1260 10
8019450 1 4 home 1280 1619 20
", header=TRUE)
Now calculate the travel times and put it in a data frame with the same shape as your data
# One travel leg per gap between consecutive activities: it starts one unit
# after the previous activity ends and stops one unit before the next begins.
travel <- data.frame(
  hhid = 8019450, perid = 1, actNo = NA, thisAct = "travel",
  from = dat$to[-nrow(dat)] + 1, # every `to` except the last
  to = dat$from[-1] - 1, # every `from` except the first
  tripTime = NA
)
then rbind and sort:
# Stack the activities and the travel legs, then sort by start time so each
# leg lands between the two activities it connects.
x <- rbind(dat, travel)
x <- x[order(x$from), ]
# NOTE(review): this renumbers perid; the expected output in the question
# keeps perid at 1 and renumbers actNo instead - confirm the intended column.
x$perid <- seq_along(x$perid)
x
hhid perid actNo thisAct from to tripTime
1 c 1 1 home 180 1051 NA
5 8019450 2 NA travel 1052 1074 NA
2 8019450 3 2 school 1075 1245 24
6 8019450 4 NA travel 1246 1254 NA
3 8019450 5 3 socrec 1255 1260 10
7 8019450 6 NA travel 1261 1279 NA
4 8019450 7 4 home 1280 1619 20
Your data:
dat <- read.table(text="hhid perid actNo thisAct from to tripTime
8019450 1 1 home 180 1051 NA
8019450 1 2 school 1075 1245 24
8019450 1 3 socrec 1255 1260 10
8019450 1 4 home 1280 1619 20", header = TRUE, stringsAsFactors = FALSE)
This is a way to get what you want:
# Keep the first activity once and duplicate every later one; the first copy
# of each pair (the even positions) is rewritten into the travel leg that
# precedes the activity.
row_idx <- c(1, rep(2:nrow(dat), each = 2))
dat2 <- dat[row_idx, ]
dat2$actNo <- seq_len(nrow(dat2))
is_travel <- rep_len(c(FALSE, TRUE), nrow(dat2))
# Derive both ends of the leg from the activity's original `from` before
# either column is overwritten.
leg_end <- dat2$from[is_travel] - 1
leg_start <- dat2$from[is_travel] - dat2$tripTime[is_travel] + 1
dat2$thisAct[is_travel] <- "travel"
dat2$to[is_travel] <- leg_end
dat2$from[is_travel] <- leg_start
dat2$tripTime[is_travel] <- NA
Since the value of tripTime for the inserted travel rows was not specified, I chose NA for the new rows.
The output:
# hhid perid actNo thisAct from to tripTime
# 1 8019450 1 1 home 180 1051 NA
# 2 8019450 1 2 travel 1052 1074 NA
# 2.1 8019450 1 3 school 1075 1245 24
# 3 8019450 1 4 travel 1246 1254 NA
# 3.1 8019450 1 5 socrec 1255 1260 10
# 4 8019450 1 6 travel 1261 1279 NA
# 4.1 8019450 1 7 home 1280 1619 20

Resources