I'm just trying to get a count of occurrences of 'stop' in variable (A) for each of 3 grouping variables (B,C,D).
A B C D
start 1 1 1
start 1 1 1
start 2 1 2
start 2 1 2
stop 1 2 1
stop 1 2 1
stop 2 2 1
Any help would be appreciated - please ask for clarification
I would convert to a data.table:
DT <- as.data.table(DF)
DT[A == 'stop', lapply(.SD, sum), .SDcols=c('B', 'C', 'D')]
B C D
1: 4 6 3
If you were working in the hadleyverse, you could do this using reshape2 and dplyr. Firstly you would use reshape to melt the data so that each of B,C,D has its own row. Then you can group_by and tally as usual.
library(reshape2)
library(dplyr)
melt(df) %>%
filter(A == "stop") %>%
group_by(variable, value) %>%
tally()
# variable value n
# 1 B 1 2
# 2 B 2 1
# 3 C 2 3
# 4 D 1 3
Do you mean each combination of B, C and D? If so here is a base R solution:
df <- read.table(text = "A B C D
start 1 1 1
start 1 1 1
start 2 1 2
start 2 1 2
stop 1 2 1
stop 1 2 1
stop 2 2 1", header = TRUE)
num.stops <- aggregate((A == "stop") ~ B + C + D, df, FUN = sum)
# B C D (A == "stop")
# 1 1 1 1 0
# 2 1 2 1 2
# 3 2 2 1 1
# 4 2 1 2 0
library(dplyr)
df%>%filter(A=='stop')%>%summarise_each(funs(sum),-1)
Related
I'm sure this is super simple but just can't find the answer. I have a data frame like so
Id event
1 1 A
2 1 B
3 1 A
4 1 A
5 2 C
6 2 C
7 2 A
And I'd like to group by Id and collapse the distinct event values while keeping the event order like so
Id event
1 1 A
2 1 B
3 1 A
4 2 C
5 2 A
Most of my searches end up with using the distinct() or unique() functions but that leads losing the A event in row 3 for Id 1.
Thanks in advance!
We can use lead to compare each row and filter those rows that are different than the previous ones. is.na(lead(Id)) is to also include the last rows.
library(dplyr)
dat2 <- dat %>%
filter(!(Id == lead(Id) & event == lead(event)) | is.na(lead(Id)))
dat2
# Id event
# 1 1 A
# 2 1 B
# 3 1 A
# 4 2 C
# 5 2 A
DATA
dat <- read.table(text = " Id event
1 1 A
2 1 B
3 1 A
4 1 A
5 2 C
6 2 C
7 2 A",
header = TRUE, stringsAsFactors = FALSE)
You can just compare every row with the one after it.
df = read.table(text=" Id event
1 1 A
2 1 B
3 1 A
4 1 A
5 2 C
6 2 C
7 2 A",
header=TRUE)
df[rowSums(df[-1,] == head(df, -1)) !=2, ]
Id event
1 1 A
2 1 B
4 1 A
6 2 C
7 2 A
Here is a solution with data.table:
library("data.table")
dt <- fread(
" Id event
1 A
1 B
1 A
1 A
2 C
2 C
2 A")
unique(dt[, r:=rleidv(event), Id])[, -3]
# Id event
# 1: 1 A
# 2: 1 B
# 3: 1 A
# 4: 2 C
# 5: 2 A
or
dt[, .SD[unique(rleidv(event))], by = Id]
(thx to #mt1022 for the comment)
A base R solution using tapply and rle:
x <- tapply(dat$event,dat$Id,function(x) rle(x)$values)
do.call(rbind,Map(data.frame,Id=names(x),event=x))
# Id event
# 1.1 1 A
# 1.2 1 B
# 1.3 1 A
# 2.1 2 C
# 2.2 2 A
I think the distinct function will be able to solve the problem.
dat %>%
distinct(Id, event)
I am trying to modify my data frame:
start end duration_time
1 1 2 2.438
2 2 1 3.901
3 1 2 18.037
4 2 3 85.861
5 3 4 83.922
and create something like this:
start end duration_time weight
1 1 2 20.475 2
2 2 1 3.901 1
4 2 3 85.861 1
5 3 4 83.922 1
So the duplicate start-end combinations will be removed, the weight will raise and duration time will sum
I already have a part working I just can't get the weight to work:
library('plyr')
df <- read.table(header = TRUE, text = "start end duration_time
1 1 2 2.438
2 2 1 3.901
3 1 2 18.037
4 2 3 85.861
5 3 4 83.922")
ddply(df, c("start","end"), summarise, weight=? ,duration_time=sum(duration_time))
A base R option is aggregate
do.call(data.frame, aggregate(duration_time~., df1,
FUN = function(x) c(duration_time=sum(x), weight = length(x))))
Simplest solution using data.table :
library(data.table)
setDT(df)[, .(duration_time=sum(duration_time), wt = .N) , by =c("start", "end")]
start end duration_time wt
1: 1 2 20.475 2
2: 2 1 3.901 1
3: 2 3 85.861 1
4: 3 4 83.922 1
Trying something using dplyr, tidyr
library(dplyr)
library(tidyr)
df1 <- df %>% unite(by_var, start,end)
df2 <- cbind(df1 %>% count(by_var), df1 %>% group_by(by_var)%>%
summarise( duration_time=sum(duration_time))%>%
separate(by_var, c("start","end")))[c(3,4,5,2)]
> df2
start end duration_time n
1 1 2 20.475 2
2 2 1 3.901 1
3 2 3 85.861 1
4 3 4 83.922 1
I have a dataset as follows:
col1 col2
A 1
A 2
A 2
B 1
B 1
C 1
C 1
C 2
I want the output as:
col1 col2 Frequency
A 1 1
A 2 2
B 1 2
C 1 2
C 2 1
I tried using the aggregate function and also the table function but I am unable to get desired result.
You can add a dummy column or use the rownames to aggregate on:
aggregate(rownames(mydf) ~ ., mydf, length)
# col1 col2 rownames(mydf)
# 1 A 1 1
# 2 B 1 2
# 3 C 1 2
# 4 A 2 2
# 5 C 2 1
table also works fine but will report combinations that may not be in your data as "0":
data.frame(table(mydf))
# col1 col2 Freq
# 1 A 1 1
# 2 B 1 2
# 3 C 1 2
# 4 A 2 2
# 5 B 2 0
# 6 C 2 1
Another nice approach is to use "data.table":
library(data.table)
as.data.table(mydf)[, .N, by = names(mydf)]
if your data is
col1 <- c("A","A","A","B","B","C","C","C")
col2 <- c(1,2,2,1,1,1,1,2)
df <- data.frame(col1,col2)
you can use dplyr
1) group_by both both variables, since your output is supposed to include every combination of them
2) count the number of observations for each group using n()
library(dplyr)
df %>% group_by(col1,col2) %>% summarize(frequency=n())
# output
col1 col2 frequency
1 A 1 1
2 A 2 2
3 B 1 2
4 C 1 2
5 C 2 1
I have the following data
> site<-c("A","A","A","B","B","C")
> sample<-c("N","N","N","W","W","S")
> effort<-c(2,2,2,1,1,3)
> y<-c(1,0,1,1,0,1)
> df<-data.frame(site,sample,effort,y)
> df
site sample effort y
1 A N 2 1
2 A N 2 0
3 A N 2 1
4 B W 1 1
5 B W 1 0
6 C S 3 1
And I would like to rearrange to get the minimum "effort" and sum "y" per sample and site.
To end up with the following
site sample effort y
1 A N 2 2
2 B W 1 1
3 C S 3 1
I have tried the following code
tr<-aggregate(.~site+sample,data=df, FUN=function(df) c(m=min(df), n=length(df)))
> tr
site sample effort.m effort.n y.m y.n
1 A N 2 3 0 3
2 C S 3 1 1 1
3 B W 1 2 0 2
This is nearly what I am looking for but is there a better way to do this and how should I deal with zeros in the data?
An answer using powerful dplyr package
library(dplyr)
df %.%
group_by(site,sample) %.%
select(site, sample) %.%
summarise (
mineff = min(effort),
y = sum(y))
site sample mineff y
1 C S 3 1
2 A N 2 2
3 B W 1 1
Using plyr
require(plyr)
ddply(df, c("site", "sample"), summarize,
min_eff = min(effort), sum_y = sum(y))
site sample min_eff sum_y
1 A N 2 2
2 B W 1 1
3 C S 3 1
In your example, there's a one-to-one correspondence between site and sample. This approach will work for every pairwise distinct combination. As to
How should I deal with zeros in the data?
How do you want to deal with them? What are your concerns?
This question already has answers here:
Numbering rows within groups in a data frame
(10 answers)
Closed 6 years ago.
I have my data that looks like below:
CustomerID TripDate
1 1/3/2013
1 1/4/2013
1 1/9/2013
2 2/1/2013
2 2/4/2013
3 1/2/2013
I need to create a counter variable, which will be like below:
CustomerID TripDate TripCounter
1 1/3/2013 1
1 1/4/2013 2
1 1/9/2013 3
2 2/1/2013 1
2 2/4/2013 2
3 1/2/2013 1
Tripcounter will be for each customer.
Use ave. Assuming your data.frame is called "mydf":
mydf$counter <- with(mydf, ave(CustomerID, CustomerID, FUN = seq_along))
mydf
# CustomerID TripDate counter
# 1 1 1/3/2013 1
# 2 1 1/4/2013 2
# 3 1 1/9/2013 3
# 4 2 2/1/2013 1
# 5 2 2/4/2013 2
# 6 3 1/2/2013 1
For what it's worth, I also implemented a version of this approach in a function included in my "splitstackshape" package. The function is called getanID:
mydf <- data.frame(IDA = c("a", "a", "a", "b", "b", "b", "b"),
IDB = c(1, 2, 1, 1, 2, 2, 2), values = 1:7)
mydf
# install.packages("splitstackshape")
library(splitstackshape)
# getanID(mydf, id.vars = c("IDA", "IDB"))
getanID(mydf, id.vars = 1:2)
# IDA IDB values .id
# 1 a 1 1 1
# 2 a 2 2 1
# 3 a 1 3 2
# 4 b 1 4 1
# 5 b 2 5 1
# 6 b 2 6 2
# 7 b 2 7 3
As you can see from the example above, I've written the function in such a way that you can specify one or more columns that should be treated as ID columns. It checks to see if any of the id.vars are duplicated, and if they are, then it generates a new ID variable for you.
You can also use plyr for this (using #AnadaMahto's example data):
> ddply(mydf, .(IDA), transform, .id = seq_along(IDA))
IDA IDB values .id
1 a 1 1 1
2 a 2 2 2
3 a 1 3 3
4 b 1 4 1
5 b 2 5 2
6 b 2 6 3
7 b 2 7 4
or even:
> ddply(mydf, .(IDA, IDB), transform, .id = seq_along(IDA))
IDA IDB values .id
1 a 1 1 1
2 a 1 3 2
3 a 2 2 1
4 b 1 4 1
5 b 2 5 1
6 b 2 6 2
7 b 2 7 3
Note that plyr does not have a reputation for being the quickest solution, for that you need to take a look at data.table.
Here's a data.table approach:
library(data.table)
DT <- data.table(mydf)
DT[, .id := sequence(.N), by = "IDA,IDB"]
DT
# IDA IDB values .id
# 1: a 1 1 1
# 2: a 2 2 1
# 3: a 1 3 2
# 4: b 1 4 1
# 5: b 2 5 1
# 6: b 2 6 2
# 7: b 2 7 3
meanwhile, you can also use dplyr. if your data.frame is called mydata
library(dplyr)
mydata %>% group_by(CustomerID) %>% mutate(TripCounter = row_number())
I need to do this often, and wrote a function that accomplishes it differently than the previous answers. I am not sure which solution is most efficient.
idCounter <- function(x) {
unlist(lapply(rle(x)$lengths, seq_len))
}
mydf$TripCounter <- idCounter(mydf$CustomerID)
Here's the procedure styled code. I dont believe in things like if you are using loop in R then you are probably doing something wrong
x <- dataframe$CustomerID
dataframe$counter <- 0
y <- dataframe$counter
count <- 1
for (i in 1:length(x)) {
ifelse (x[i] == x[i-1], count <- count + 1, count <- 1 )
y[i] <- count
}
dataframe$counter <- y
This isn't the right answer but showing some interesting things comparing to for loops, vectorization is fast does not care about sequential updating.
a<-read.table(textConnection(
"CustomerID TripDate
1 1/3/2013
1 1/4/2013
1 1/9/2013
2 2/1/2013
2 2/4/2013
3 1/2/2013 "), header=TRUE)
a <- a %>%
group_by(CustomerID,TripDate) # must in order
res <- rep(1, nrow(a)) #base # 1
res[2:6] <-sapply(2:6, function(i)if(a$CustomerID[i]== a$CustomerID[i - 1]) {res[i] = res[i-1]+1} else {res[i]= res[i]})
a$TripeCounter <- res