Append frequency value from data - r

I have a data set that looks like this:
XDATA
SAMPN HHSIZE TOTVEH
1 2 3
2 6 4
2 6 4
5 1 3
5 1 3
5 1 3
How can I add an extra column for, let's say, the SAMPN frequency, so that it looks like this:
XDATA
SAMPN HHSIZE TOTVEH FREQ
1 2 3 1
2 6 4 2
2 6 4 2
5 1 3 3
5 1 3 3
5 1 3 3
Thanks in advance

library(data.table)
XDATA <- data.table(XDATA)
XDATA[, FREQ := .N, by=SAMPN]
XDATA
SAMPN HHSIZE TOTVEH FREQ
1: 1 2 3 1
2: 2 6 4 2
3: 2 6 4 2
4: 5 1 3 3
5: 5 1 3 3
6: 5 1 3 3
>
For base R, see @Ananda Mahto's solution.

An alternative to @Ricardo's answer for base R (which uses tapply and merge) is to use ave:
within(XDATA, {
FREQ <- ave(SAMPN, SAMPN, FUN = length)
})
# SAMPN HHSIZE TOTVEH FREQ
# 1 1 2 3 1
# 2 2 6 4 2
# 3 2 6 4 2
# 4 5 1 5 3
# 5 5 1 5 3
# 6 5 1 5 3

XDATA <- data.table(
SAMPN = c(1,2,2,5,5,5),
HHSIZE = c(2,6,6,1,1,1),
TOTVEH = c(3,4,4,5,5,5)
)
XDATA[, COUNT := 1]
XDATA[, FREQ := sum(COUNT), by = c('SAMPN','HHSIZE','TOTVEH')]
#SAMPN HHSIZE TOTVEH COUNT FREQ
#1: 1 2 3 1 1
#2: 2 6 4 1 2
#3: 2 6 4 1 2
#4: 5 1 5 1 3
#5: 5 1 5 1 3
#6: 5 1 5 1 3

Related

identify whenever values repeat in r

I have a dataframe like this.
data <- data.frame(Condition = c(1,1,2,3,1,1,2,2,2,3,1,1,2,3,3))
I want to populate a new variable Sequence which identifies whenever Condition starts again from 1.
So the new dataframe would look like this.
Thanks in advance for the help!
data <- data.frame(Condition = c(1,1,2,3,1,1,2,2,2,3,1,1,2,3,3),
Sequence = c(1,1,1,1,2,2,2,2,2,2,3,3,3,3,3))
base R
data$Sequence2 <- cumsum(c(TRUE, data$Condition[-1] == 1 & data$Condition[-nrow(data)] != 1))
data
# Condition Sequence Sequence2
# 1 1 1 1
# 2 1 1 1
# 3 2 1 1
# 4 3 1 1
# 5 1 2 2
# 6 1 2 2
# 7 2 2 2
# 8 2 2 2
# 9 2 2 2
# 10 3 2 2
# 11 1 3 3
# 12 1 3 3
# 13 2 3 3
# 14 3 3 3
# 15 3 3 3
dplyr
library(dplyr)
data %>%
mutate(
Sequence2 = cumsum(Condition == 1 & lag(Condition != 1, default = TRUE))
)
# Condition Sequence Sequence2
# 1 1 1 1
# 2 1 1 1
# 3 2 1 1
# 4 3 1 1
# 5 1 2 2
# 6 1 2 2
# 7 2 2 2
# 8 2 2 2
# 9 2 2 2
# 10 3 2 2
# 11 1 3 3
# 12 1 3 3
# 13 2 3 3
# 14 3 3 3
# 15 3 3 3
This took a while, but I finally found this solution:
library(dplyr)
# Start a new group at every row where Condition is 1 and the previous row is
# not 1. The first row is compared against the default 0, so a leading 1 opens
# group 1. A lead()-based test ("is this 1 followed by another 1?") would miss
# a run consisting of a single 1 and would count a run of three 1s twice.
# The column name `Sequnce` is kept as-is so it matches the printed output.
data %>%
  group_by(
    Sequnce = cumsum(Condition == 1 & lag(Condition, default = 0) != 1)
  )
Condition Sequnce
<dbl> <int>
1 1 1
2 1 1
3 2 1
4 3 1
5 1 2
6 1 2
7 2 2
8 2 2
9 2 2
10 3 2
11 1 3
12 1 3
13 2 3
14 3 3
15 3 3

How to create a count up data frame

I’m trying to create a data frame in r that looks like this
X Y Z
3 1 1
3 1 2
3 1 3
3 2 1
3 2 2
3 2 3
4 1 1
4 1 2
4 1 3
4 2 1
...
So column Z counts up to 3; when it reaches 3, column Y increments by 1 and Z counts up to 3 again. Then X increments by 1 and the process starts again.
You could use expand.grid + rev
rev(expand.grid(z = 1:3, y = 1:2, x = 3:4))
x y z
1 3 1 1
2 3 1 2
3 3 1 3
4 3 2 1
5 3 2 2
6 3 2 3
7 4 1 1
8 4 1 2
9 4 1 3
10 4 2 1
11 4 2 2
12 4 2 3
An option is to use tidyr::crossing().
In your case:
crossing(X = 3:4,
Y = 1:2,
Z = 1:3)
data.frame(X=rep(3:4,each=6,1),
Y=rep(1:2,each=3,2),
Z=rep(1:3,each=1,4))
Here is another base R solution in addition to the expand.grid approach by @Onyambu.
The advantage of the code below is that you only need to put everything into the list lst and pass it to the function f:
# Build the Cartesian product of the vectors in `lst` as a data frame.
#
# The first element of `lst` varies slowest and the last element varies
# fastest, i.e. rows are ordered like nested loops over lst[[1]], lst[[2]], ...
#
# lst: a (preferably named) list of atomic vectors; names become column names.
# Returns a data.frame with prod(lengths(lst)) rows and one column per element.
f <- function(lst) {
  sizes <- lengths(lst)
  total <- prod(sizes)

  # Any empty input makes the product empty; return zero rows but keep the
  # columns (and their types) instead of failing on rep(each = NaN).
  if (total == 0) {
    return(data.frame(lapply(lst, function(p) p[0L])))
  }

  # Column i repeats each value `inner[i]` times in a row, and that whole
  # pattern `outer[i]` times, so every column has exactly `total` entries.
  inner <- total / cumprod(sizes)
  outer <- total / (sizes * inner)

  # Map() always returns a list, so the result never depends on mapply()'s
  # simplification (which collapses all-length-1 inputs into a single vector)
  # or on data.frame() silently recycling shorter columns.
  cols <- Map(
    function(p, n_each, n_times) rep(rep(p, each = n_each), times = n_times),
    lst, inner, outer
  )
  data.frame(cols)
}
lst<- list(x = 3:4,y = 1:2,z = 1:3)
res <- f(lst)
such that
> res
x y z
1 3 1 1
2 3 1 2
3 3 1 3
4 3 2 1
5 3 2 2
6 3 2 3
7 4 1 1
8 4 1 2
9 4 1 3
10 4 2 1
11 4 2 2
12 4 2 3
A data.table solution for completeness:
data.table::CJ(x = 3:4, y = 1:2, z = 1:3)
x y z
1: 3 1 1
2: 3 1 2
3: 3 1 3
4: 3 2 1
5: 3 2 2
6: 3 2 3
7: 4 1 1
8: 4 1 2
9: 4 1 3
10: 4 2 1
11: 4 2 2
12: 4 2 3

Shifting a variable up in R data frame with respect to another variable

I have a data.frame:
ID <-c(2,2,2,2,3,3,5,5)
Pur<-c(0,1,2,3,1,2,4,5)
df<-data.frame(ID,Pur)
I would like to push the Pur up for each ID to get the up.Pur as follows:
ID Pur up.Pur
2 0 1
2 1 2
2 2 3
2 3 NA
3 1 2
3 2 NA
5 4 5
5 5 NA
Would appreciate your help with this.
Here is a dplyr approach
library(dplyr)
ID <-c(2,2,2,2,3,3,5,5)
Pur<-c(0,1,2,3,1,2,4,5)
df<-data.frame(ID,Pur)
df %>%
group_by(ID) %>%
mutate(up.Pur = lead(Pur))
# Source: local data frame [8 x 3]
# Groups: ID [3]
#
# ID Pur up.Pur
# <dbl> <dbl> <dbl>
# 1 2 0 1
# 2 2 1 2
# 3 2 2 3
# 4 2 3 NA
# 5 3 1 2
# 6 3 2 NA
# 7 5 4 5
# 8 5 5 NA
For completeness, I've added a base R approach, just in case you don't feel like installing any packages.
# Base R version of a grouped "lead": split the rows by ID, shift Pur up by one
# position inside each piece (padding the end with NA), then put the pieces
# back together in the original row order.
# Grouping uses df$ID rather than a free-standing ID vector, so the snippet
# works whenever the data frame itself is available.
df_list <- split(df, df$ID)
df_list <- lapply(df_list, function(piece) {
  piece$up.Pur <- c(piece$Pur[-1], NA)
  piece
})
unsplit(df_list, df$ID)
# ID Pur up.Pur
# 1 2 0 1
# 2 2 1 2
# 3 2 2 3
# 4 2 3 NA
# 5 3 1 2
# 6 3 2 NA
# 7 5 4 5
# 8 5 5 NA
We can use shift from data.table
library(data.table)
setDT(df)[, up.Pur := shift(Pur, type = "lead"), by = ID]
df
# ID Pur up.Pur
#1: 2 0 1
#2: 2 1 2
#3: 2 2 3
#4: 2 3 NA
#5: 3 1 2
#6: 3 2 NA
#7: 5 4 5
#8: 5 5 NA

Count with table() and exclude 0's

I try to count triplets; for this I use three vectors that are packed in a dataframe:
X=c(4,4,4,4,4,4,4,4,1,1,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,3,3)
Y=c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,3,4,2,2,2,2,3,4,1,1,2,2,3,3,4,4)
Z=c(4,4,5,4,4,4,4,4,6,1,1,1,1,1,1,1,2,2,2,2,7,2,3,3,3,3,3,3,3,3)
Count_Frame=data.frame(matrix(NA, nrow=(length(X)), ncol=3))
Count_Frame[1]=X
Count_Frame[2]=Y
Count_Frame[3]=Z
Counts=data.frame(table(Count_Frame))
There is the following problem: if I increase the value range in the vectors or use even more vectors the "Counts" dataframe quickly approaches its size limit due to the many 0-counts. Is there a way to exclude the 0-counts while generating "Counts"?
We can use data.table. Convert the 'data.frame' to 'data.table' (setDT(Count_Frame)), grouped by all the columns (.(X, Y, Z)), we get the number of rows (.N).
library(data.table)
setDT(Count_Frame)[,.N ,.(X, Y, Z)]
# X Y Z N
# 1: 4 1 4 7
# 2: 4 1 5 1
# 3: 1 1 6 1
# 4: 1 1 1 3
# 5: 1 2 1 2
# 6: 1 3 1 1
# 7: 1 4 1 1
# 8: 2 2 2 4
# 9: 2 3 7 1
#10: 2 4 2 1
#11: 3 1 3 2
#12: 3 2 3 2
#13: 3 3 3 2
#14: 3 4 3 2
Instead of naming all the columns, we can use names(Count_Frame) as well (if there are many columns)
setDT(Count_Frame)[,.N , names(Count_Frame)]
You can accomplish this with aggregate:
Count_Frame$one <- 1
aggregate(one ~ X1 + X2 + X3, data=Count_Frame, FUN=sum)
This will calculate the positive instances of table, but will not list the zero counts.
One solution is to create a combination of the column values and count those instead:
library(tidyr)
as.data.frame(table(unite(Count_Frame, tmp, X1, X2, X3))) %>%
separate(Var1, c('X1', 'X2', 'X3'))
Resulting output is:
X1 X2 X3 Freq
1 1 1 1 3
2 1 1 6 1
3 1 2 1 2
4 1 3 1 1
5 1 4 1 1
6 2 2 2 4
7 2 3 7 1
8 2 4 2 1
9 3 1 3 2
10 3 2 3 2
11 3 3 3 2
12 3 4 3 2
13 4 1 4 7
14 4 1 5 1
Or using plyr:
library(plyr)
count(Count_Frame, colnames(Count_Frame))
output
# > count(Count_Frame, colnames(Count_Frame))
# X1 X2 X3 freq
# 1 1 1 1 3
# 2 1 1 6 1
# 3 1 2 1 2
# 4 1 3 1 1
# 5 1 4 1 1
# 6 2 2 2 4
# 7 2 3 7 1
# 8 2 4 2 1
# 9 3 1 3 2
# 10 3 2 3 2
# 11 3 3 3 2
# 12 3 4 3 2
# 13 4 1 4 7
# 14 4 1 5 1

Combining an individual and aggregate level data sets

I've got two different data frames; let's call them "Months" and "People".
Months looks like this:
Month Site X
1 1 4
2 1 3
3 1 5
1 2 10
2 2 7
3 2 5
and People looks like this:
ID Month Site
1 1 1
2 1 2
3 1 1
4 2 2
5 2 2
6 2 2
7 3 1
8 3 2
I'd like to combine them so essentially each time an entry in "People" has a particular Month and Site combination, it's added to the appropriate aggregated data frame, so I'd get something like the following:
Month Site X People
1 1 4 2
2 1 3 0
3 1 5 1
1 2 10 1
2 2 7 3
3 2 5 1
But I haven't the foggiest idea of how to go about doing that. Any suggestions?
Using base packages
> # Store the per Month/Site counts in aggdata, which merge() needs below;
> # the outer parentheses make the assignment print its value as well.
> (aggdata <- aggregate( ID ~ Month + Site, data=People, FUN = length ))
Month Site ID
1 1 1 2
2 3 1 1
3 1 2 1
4 2 2 3
5 3 2 1
> res <- merge(Months, aggdata, all.x = TRUE)
> res
Month Site X ID
1 1 1 4 2
2 1 2 10 1
3 2 1 3 NA
4 2 2 7 3
5 3 1 5 1
6 3 2 5 1
> res[is.na(res)] <- 0
> res
Month Site X ID
1 1 1 4 2
2 1 2 10 1
3 2 1 3 0
4 2 2 7 3
5 3 1 5 1
6 3 2 5 1
Assuming your data.frames are months and people, here's a data.table solution:
library(data.table)
m.dt <- data.table(months, key = c("Month", "Site"))
p.dt <- data.table(people, key = c("Month", "Site"))
# one-liner: join people onto every Month/Site row of months and, for each row
# of m.dt (by = .EACHI), keep X and count the matched people. Rows of m.dt with
# no match get ID = NA, so they count as 0. Without by = .EACHI, current
# data.table versions evaluate j once over the whole join and return one row.
dt.f <- p.dt[m.dt, list(X = X[1], People = sum(!is.na(ID))), by = .EACHI]
> dt.f
# Month Site X People
# 1: 1 1 4 2
# 2: 1 2 10 1
# 3: 2 1 3 0
# 4: 2 2 7 3
# 5: 3 1 5 1
# 6: 3 2 5 1

Resources