R: Calculating New Variable R Code - r

I have
id_1 id_2 name count total
1 001 111 a 15
2 001 111 b 3
3 001 111 sum 28 28
4 002 111 a 7
5 002 111 b 33
6 002 111 sum 48 48
I want the rows that share the same id_1 and id_2 to share the total, like
id_1 id_2 name count total
1 001 111 a 15 28
2 001 111 b 3 28
3 001 111 sum 28 28
4 002 111 a 7 48
5 002 111 b 33 48
6 002 111 sum 48 48

We can use fill from tidyr.
library(tidyr)
dat2 <- dat %>% fill(total, .direction = "up")
dat2
# id_1 id_2 name count total
# 1 1 111 a 15 28
# 2 1 111 b 3 28
# 3 1 111 sum 28 28
# 4 2 111 a 7 48
# 5 2 111 b 33 48
# 6 2 111 sum 48 48
DATA
dat <- read.table(text = " id_1 id_2 name count total
1 001 111 a 15 NA
2 001 111 b 3 NA
3 001 111 sum 28 28
4 002 111 a 7 NA
5 002 111 b 33 NA
6 002 111 sum 48 48",
header = TRUE, stringsAsFactors = FALSE)

Consider base R's ave calculating group max (na.rm to handle NA):
df$total <- ave(df$total, df$id_1, df$_id_2, FUN=function(i) max(i, na.rm=na.omit))
df
# id_1 id_2 name count total
# 1 1 111 a 15 28
# 2 1 111 b 3 28
# 3 1 111 sum 28 28
# 4 2 111 a 7 48
# 5 2 111 b 33 48
# 6 2 111 sum 48 48

Using zoo and data.table:
df <- read.table(text = "id_1 id_2 name count total
001 111 a 15 NA
001 111 b 3 NA
001 111 sum 28 28
002 111 a 7 NA
002 111 b 33 NA
002 111 sum 48 48",
header = TRUE, stringsAsFactors = FALSE)# create data
library(zoo)# load packages
library(data.table)
setDT(df)[, total := na.locf(na.locf(total, na.rm=FALSE), na.rm=FALSE, fromLast=TRUE), by = c("id_1", "id_2")]# convert df to data.table and carry forward and backward total by ids
Output:
id_1 id_2 name count total
1: 1 111 a 15 28
2: 1 111 b 3 28
3: 1 111 sum 28 28
4: 2 111 a 7 48
5: 2 111 b 33 48
6: 2 111 sum 48 48

Simple approach using the normal dplyr way:
dat %>% group_by(id_1, id_2) %>% mutate(total=count[name == "sum"])
Alternatively:
dat %>% group_by(id_1, id_2) %>% mutate(total=na.omit(total)[1])
id_1 id_2 name count total
<int> <int> <chr> <int> <int>
1 1 111 a 15 28
2 1 111 b 3 28
3 1 111 sum 28 28
4 2 111 a 7 48
5 2 111 b 33 48
6 2 111 sum 48 48

Related

How to find common items in two data frames in R?

Below is the sample code. Seems simple enough but what I find online makes this far too complex. I am simply wanting to create a third data frame that contains one column.. the areas that common to firstdf and seconddf.
area1 <- c("001","005","007","009","011","013","015","017","019","021","023","025")
Employment1 <- c(2,4,6,8,110,12,14,15,16,17,12,20)
firstdf <- data.frame(area1,Employment1)
area2 <- c("005","007","011","013","015","021","027","033")
Employment2 <- c(100,101,102,103,104,111,321,522)
seconddf <- data.frame(area2,Employment2)
intended result
thirddf =>
area
005
007
011
013
015
021
We can use intersect:
data.frame(area = intersect(firstdf$area1, seconddf$area2))
# area
# 1 005
# 2 007
# 3 011
# 4 013
# 5 015
# 6 021
though there's no need for the frame, it can easily be intersect(..) without the wrapping data.frame(.).
Candidly, though, it is common to want to preserve the other columns from either or both frames. Some thoughts:
### rows where `area` is in both
merge(firstdf, seconddf, by.x = "area1", by.y = "area2")
# area1 Employment1 Employment2
# 1 005 4 100
# 2 007 6 101
# 3 011 110 102
# 4 013 12 103
# 5 015 14 104
# 6 021 17 111
### always keep first frame
merge(firstdf, seconddf, by.x = "area1", by.y = "area2", all.x = TRUE)
# area1 Employment1 Employment2
# 1 001 2 NA
# 2 005 4 100
# 3 007 6 101
# 4 009 8 NA
# 5 011 110 102
# 6 013 12 103
# 7 015 14 104
# 8 017 15 NA
# 9 019 16 NA
# 10 021 17 111
# 11 023 12 NA
# 12 025 20 NA
### always keep second frame
merge(firstdf, seconddf, by.x = "area1", by.y = "area2", all.y = TRUE)
# area1 Employment1 Employment2
# 1 005 4 100
# 2 007 6 101
# 3 011 110 102
# 4 013 12 103
# 5 015 14 104
# 6 021 17 111
# 7 027 NA 321
# 8 033 NA 522
### keep all rows, NAs abound
merge(firstdf, seconddf, by.x = "area1", by.y = "area2", all = TRUE)
# area1 Employment1 Employment2
# 1 001 2 NA
# 2 005 4 100
# 3 007 6 101
# 4 009 8 NA
# 5 011 110 102
# 6 013 12 103
# 7 015 14 104
# 8 017 15 NA
# 9 019 16 NA
# 10 021 17 111
# 11 023 12 NA
# 12 025 20 NA
# 13 027 NA 321
# 14 033 NA 522
There are dplyr-variants of the merge(..) calls above that might be easier to read if you're already learning/using that dialect. The joining is over-kill if you never want all of the other columns, but I offer it in case that's the direction you're headed.
library(dplyr)
tibble(intersect(firstdf$area1, seconddf$area2))
1 005
2 007
3 011
4 013
5 015
6 021

How to delete missing observations for a subset of columns: the R equivalent of dropna(subset) from python pandas

Consider a dataframe in R where I want to drop row 6 because it has missing observations for the variables var1:var3. But the dataframe has valid observations for id and year. See code below.
In python, this can be done in two ways:
use df.dropna(subset = ['var1', 'var2', 'var3'], inplace=True)
use df.set_index(['id', 'year']).dropna()
How to do this in R with tidyverse?
library(tidyverse)
df <- tibble(id = c(seq(1,10)), year=c(seq(2001,2010)),
var1 = c(sample(1:100, 10, replace=TRUE)),
var2 = c(sample(1:100, 10, replace=TRUE)),
var3 = c(sample(1:100, 10, replace=TRUE)))
df[3,4] = NA
df[6,3:5] = NA
df[8,3:4] = NA
df[10,4:5] = NA
We may use complete.cases
library(dplyr)
df %>%
filter(if_any(var1:var3, complete.cases))
-output
# A tibble: 9 x 5
id year var1 var2 var3
<int> <int> <int> <int> <int>
1 1 2001 48 55 82
2 2 2002 22 83 67
3 3 2003 89 NA 19
4 4 2004 56 1 38
5 5 2005 17 58 35
6 7 2007 4 30 94
7 8 2008 NA NA 36
8 9 2009 97 100 80
9 10 2010 37 NA NA
We can use pmap for this case also:
library(dplyr)
library(purrr)
df %>%
filter(!pmap_lgl(., ~ {x <- c(...)[-c(1, 2)];
all(is.na(x))}))
# A tibble: 9 x 5
id year var1 var2 var3
<int> <int> <int> <int> <int>
1 1 2001 90 55 77
2 2 2002 77 5 18
3 3 2003 17 NA 70
4 4 2004 72 33 33
5 5 2005 10 55 77
6 7 2007 22 81 17
7 8 2008 NA NA 46
8 9 2009 93 28 100
9 10 2010 50 NA NA
Or we could also use complete.cases function in pmap as suggested by dear #akrun:
df %>%
filter(pmap_lgl(select(., 3:5), ~ any(complete.cases(c(...)))))
You can use if_any in filter -
library(dplyr)
df %>% filter(if_any(var1:var3, Negate(is.na)))
# id year var1 var2 var3
# <int> <int> <int> <int> <int>
#1 1 2001 14 99 43
#2 2 2002 25 72 76
#3 3 2003 90 NA 15
#4 4 2004 91 7 32
#5 5 2005 69 42 7
#6 7 2007 57 83 41
#7 8 2008 NA NA 74
#8 9 2009 9 78 23
#9 10 2010 93 NA NA
In base R, we can use rowSums to select rows which has atleast 1 non-NA value.
cols <- grep('var', names(df))
df[rowSums(!is.na(df[cols])) > 0, ]
If looking for complete cases, use the following (kernel of this is based on other answers):
library(tidyverse)
df <- tibble(id = c(seq(1,10)), year=c(seq(2001,2010)),
var1 = c(sample(1:100, 10, replace=TRUE)),
var2 = c(sample(1:100, 10, replace=TRUE)),
var3 = c(sample(1:100, 10, replace=TRUE)))
df[3,4] = NA
df[6,3:5] = NA
df[8,3:4] = NA
df[10,4:5] = NA
df %>% filter(!if_any(var1:var3, is.na))
#> # A tibble: 6 x 5
#> id year var1 var2 var3
#> <int> <int> <int> <int> <int>
#> 1 1 2001 13 28 26
#> 2 2 2002 61 77 58
#> 3 4 2004 95 38 58
#> 4 5 2005 38 34 91
#> 5 7 2007 85 46 14
#> 6 9 2009 45 60 40
Created on 2021-06-24 by the reprex package (v2.0.0)

How to restructure data with one observation by row into data with one observation by ID (and multiple columns) in R?

Let's say I have a dataframe with 3 ID columns and one column of interest. Each row represents one observation. Some ID have multiple observations, i.e., multiple rows.
df <- data.frame(id1 = c( 1, 2, 3, 4, 4),
id2 = c( 11, 12, 13, 14, 14),
id3 = c(111, 112, 113, 114, 114),
variable_of_interest = c(13, 24, 35, 31, 12))
id1 id2 id3 variable_of_interest
1 1 11 111 13
2 2 12 112 24
3 3 13 113 35
4 4 14 114 31
5 4 14 114 12
My goal is to restructure it in odred to have one row per ID, to keep the 3 IDs and to name the new columns "variable_of_interest1", "variable_of_interest2":
id1 id2 id3 variable_of_interest1 variable_of_interest1
1 1 11 111 13 NA
2 2 12 112 24 NA
3 3 13 113 35 NA
4 4 14 114 31 12
The solution might need reshape2 and the dcast function, but until now, I could not solve this out.
We can create a sequence grouped by the 'id' columns and then with pivot_wider reshape to wide
library(dplyr)
library(stringr)
library(tidyr)
library(data.table)
df %>%
mutate(ind = str_c('variable_of_interest', rowid(id1, id2, id3))) %>%
pivot_wider(names_from = ind, values_from = variable_of_interest)
-output
# A tibble: 4 x 5
# id1 id2 id3 variable_of_interest1 variable_of_interest2
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 11 111 13 NA
#2 2 12 112 24 NA
#3 3 13 113 35 NA
#4 4 14 114 31 12
Or another option is data.table
library(data.table)
dcast(setDT(df), id1 + id2 + id3 ~
paste0('variable_of_interest', rowid(id1, id2, id3)),
value.var = 'variable_of_interest')
-output
# id1 id2 id3 variable_of_interest1 variable_of_interest2
#1: 1 11 111 13 NA
#2: 2 12 112 24 NA
#3: 3 13 113 35 NA
#4: 4 14 114 31 12

Cleaning a data.frame in a semi-reshape/semi-aggregate fashion

First time posting something here, forgive any missteps in my question.
In my example below I've got a data.frame where the unique identifier is the tripID with the name of the vessel, the species code, and a catch metric.
> testFrame1 <- data.frame('tripID' = c(1,1,2,2,3,4,5),
'name' = c('SS Anne','SS Anne', 'HMS Endurance', 'HMS Endurance','Salty Hippo', 'Seagallop', 'Borealis'),
'SPP' = c(101,201,101,201,102,102,103),
'kept' = c(12, 22, 14, 24, 16, 18, 10))
> testFrame1
tripID name SPP kept
1 1 SS Anne 101 12
2 1 SS Anne 201 22
3 2 HMS Endurance 101 14
4 2 HMS Endurance 201 24
5 3 Salty Hippo 102 16
6 4 Seagallop 102 18
7 5 Borealis 103 10
I need a way to basically condense the data.frame so that all there is only one row per tripID as shown below.
> testFrame1
tripID name SPP kept SPP.1 kept.1
1 1 SS Anne 101 12 201 22
2 2 HMS Endurance 101 14 201 24
3 3 Salty Hippo 102 16 NA NA
4 4 Seagallop 102 18 NA NA
5 5 Borealis 103 10 NA NA
I've looked into tidyr and reshape but neither of those are can deliver quite what I'm asking for. Is there anything out there that does this quasi-reshaping?
Here are two alternatives using base::reshape and data.table::dcast:
1) base R
reshape(transform(testFrame1,
timevar = ave(tripID, tripID, FUN = seq_along)),
idvar = cbind("tripID", "name"),
timevar = "timevar",
direction = "wide")
# tripID name SPP.1 kept.1 SPP.2 kept.2
#1 1 SS Anne 101 12 201 22
#3 2 HMS Endurance 101 14 201 24
#5 3 Salty Hippo 102 16 NA NA
#6 4 Seagallop 102 18 NA NA
#7 5 Borealis 103 10 NA NA
2) data.table
library(data.table)
setDT(testFrame1)
dcast(testFrame1, tripID + name ~ rowid(tripID), value.var = c("SPP", "kept"))
# tripID name SPP_1 SPP_2 kept_1 kept_2
#1: 1 SS Anne 101 201 12 22
#2: 2 HMS Endurance 101 201 14 24
#3: 3 Salty Hippo 102 NA 16 NA
#4: 4 Seagallop 102 NA 18 NA
#5: 5 Borealis 103 NA 10 NA
Great reproducible post considering it's your first. Here's a way to do it with dplyr and tidyr -
testFrame1 %>%
group_by(tripID, name) %>%
summarise(
SPP = toString(SPP),
kept = toString(kept)
) %>%
ungroup() %>%
separate("SPP", into = c("SPP", "SPP.1"), sep = ", ", extra = "drop", fill = "right") %>%
separate("kept", into = c("kept", "kept.1"), sep = ", ", extra = "drop", fill = "right")
# A tibble: 5 x 6
tripID name SPP SPP.1 kept kept.1
<dbl> <chr> <chr> <chr> <chr> <chr>
1 1.00 SS Anne 101 201 12 22
2 2.00 HMS Endurance 101 201 14 24
3 3.00 Salty Hippo 102 <NA> 16 <NA>
4 4.00 Seagallop 102 <NA> 18 <NA>
5 5.00 Borealis 103 <NA> 10 <NA>

How to remove subjects with missing yearly observations in R?

num Name year age X
1 1 A 2011 68 116292
2 1 A 2012 69 46132
3 1 A 2013 70 7042
4 1 A 2014 71 -100425
5 1 A 2015 72 6493
6 2 B 2011 20 -8484
7 3 C 2015 23 -120836
8 4 D 2011 3 -26523
9 4 D 2012 4 9923
10 4 D 2013 5 82432
I have the data which is represented by various subjects in 5 years. I need to remove all the subjects, which are missing any of years from 2011 to 2015. How can I accomplish it, so in given data only subject A is left?
Using data.table:
A data.table solution might look something like this:
library(data.table)
dt <- as.data.table(df)
dt[, keep := identical(unique(year), 2011:2015), by = Name ][keep == T, ][,keep := NULL]
# num Name year age X
#1: 1 A 2011 68 116292
#2: 1 A 2012 69 46132
#3: 1 A 2013 70 7042
#4: 1 A 2014 71 -100425
#5: 1 A 2015 72 6493
This is more strict in that it requires that the unique years be exactly equal to 2011:2015. If there is a 2016, for example that person would be excluded.
A less restrictive solution would be to check that 2011:2015 is in your unique years. This should work:
dt[, keep := all(2011:2015 %in% unique(year)), by = Name ][keep == T, ][,keep := NULL]
Thus, if for example, A had a 2016 year and a 2010 year it would still keep all of A. But if anyone is missing a year in 2011:2015 this would exclude them.
Using base R & aggregate:
Same option, but using aggregate from base R:
agg <- aggregate(df$year, by = list(df$Name), FUN = function(x) all(2011:2015 %in% unique(x)))
df[df$Name %in% agg[agg$x == T, 1] ,]
Here is a slightly more straightforward tidyverse solution.
First, expand the dataframe to include all combinations of Name + year:
df %>% complete(Name, year)
# A tibble: 20 x 5
Name year num age X
<fctr> <int> <int> <int> <int>
1 A 2011 1 68 116292
2 A 2012 1 69 46132
3 A 2013 1 70 7042
4 A 2014 1 71 -100425
5 A 2015 1 72 6493
6 B 2011 2 20 -8484
7 B 2012 NA NA NA
8 B 2013 NA NA NA
9 B 2014 NA NA NA
10 B 2015 NA NA NA
...
Then extend the pipe to group by "Name", and filter to keep only those with 0 NA values:
df %>% complete(Name, year) %>%
group_by(Name) %>%
filter(sum(is.na(age)) == 0)
# A tibble: 5 x 5
# Groups: Name [1]
Name year num age X
<fctr> <int> <int> <int> <int>
1 A 2011 1 68 116292
2 A 2012 1 69 46132
3 A 2013 1 70 7042
4 A 2014 1 71 -100425
5 A 2015 1 72 6493
Just check which names have the right number of entries.
## Reproduce your data
df = read.table(text=" num Name year age X
1 1 A 2011 68 116292
2 1 A 2012 69 46132
3 1 A 2013 70 7042
4 1 A 2014 71 -100425
5 1 A 2015 72 6493
6 2 B 2011 20 -8484
7 3 C 2015 23 -120836
8 4 D 2011 3 -26523
9 4 D 2012 4 9923
10 4 D 2013 5 82432",
header=TRUE)
Tab = table(df$Name)
Keepers = names(Tab)[which(Tab == 5)]
df[df$Name %in% Keepers,]
num Name year age X
1 1 A 2011 68 116292
2 1 A 2012 69 46132
3 1 A 2013 70 7042
4 1 A 2014 71 -100425
5 1 A 2015 72 6493
Here is a somewhat different approach using tidyverse packages:
library(tidyverse)
df <- read.table(text = " num Name year age X
1 1 A 2011 68 116292
2 1 A 2012 69 46132
3 1 A 2013 70 7042
4 1 A 2014 71 -100425
5 1 A 2015 72 6493
6 2 B 2011 20 -8484
7 3 C 2015 23 -120836
8 4 D 2011 3 -26523
9 4 D 2012 4 9923
10 4 D 2013 5 82432")
df2 <- spread(data = df, key = Name, value = year)
x <- colSums(df2[, 4:7], na.rm = TRUE) > 10000
df3 <- select(df2, num, age, X, c(4:7)[x])
df4 <- na.omit(df3)
All steps can of course be constructed as one single pipe with the %>% operator.

Resources