How to do a binary if statement in R? - r

I have a data set with a column of letters, followed by another column of ones and zeroes. I want to total the amount of "ones" for each letter, but am unsure how to do so in an effective way.
I appreciate the help.

We can group by the first column ('col1') and then get the sum of 'col2'
library(dplyr)
df1 %>%
group_by(col1) %>%
summarise(Total = sum(col2))
Or in data.table
library(data.table)
setDT(df1)[, .(Total = sum(col2)), col1]
Or with base R
rowsum(df1$col2, df1$col1)

Here are some other base R solutions
> tapply(df$col2, df$col1, sum)
a b c
1 1 2
> xtabs(col2~col1,df)
col1
a b c
1 1 2
Dummy Data
df <- structure(list(col1 = structure(c(1L, 3L, 1L, 2L, 1L, 3L, 3L,
2L, 2L, 3L), .Label = c("a", "b", "c"), class = "factor"), col2 = c(0,
0, 0, 0, 1, 1, 1, 1, 0, 0)), class = "data.frame", row.names = c(NA,
-10L))
> df
col1 col2
1 a 0
2 c 0
3 a 0
4 b 0
5 a 1
6 c 1
7 c 1
8 b 1
9 b 0
10 c 0

Related

create new column using differences of rows

I have a dataset as below.
How can I create a new column B using the difference of values in A with matching ID. Apologies if this has been asked before. Thanks
Using dplyr, we can group_by ID and subtract first and last values of A.
library(dplyr)
df %>%
group_by(ID) %>%
summarise(B = first(A) - last(A), A = first(A)) %>%
select(names(df), B)
# A tibble: 4 x 3
# ID A B
# <fct> <dbl> <dbl>
#1 aa 2 -1
#2 bb 4 0
#3 cc 3 1
#4 dd 1 0
data
df <- structure(list(ID = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L
), .Label = c("aa", "bb", "cc", "dd"), class = "factor"), A = c(2,
4, 3, 1, 3, 4, 2, 1)), class = "data.frame", row.names = c(NA, -8L))
We can use data.table methods
library(data.table)
setDT(df)[, .(B = first(A) - last(A), A = first(A)), .(ID)]
data
df <- structure(list(ID = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L
), .Label = c("aa", "bb", "cc", "dd"), class = "factor"), A = c(2,
4, 3, 1, 3, 4, 2, 1)), class = "data.frame", row.names = c(NA, -8L))
Another approach could be to pivot the table so that the two 'A' values are in separate columns.
library(tidyverse)
df %>%
mutate(name = if_else(duplicated(ID), "A_additional", "A")) %>%
pivot_wider(id_cols = ID, values_from = A, names_from = name) %>%
mutate(B = A - A_additional)
# # A tibble: 4 x 4
# ID A A_additional B
# <fct> <dbl> <dbl> <dbl>
# 1 aa 2 3 -1
# 2 bb 4 4 0
# 3 cc 3 2 1
# 4 dd 1 1 0
This solution doesn't require grouping, so should scale well to larger data sets.

R increment by 1 for every change in value column and restart the counter

I would like to find a way to do something very similar to this question:
Increment by 1 for every change in column
But I want to restart the counter whenever var1 is "c".
using
# Number each run of identical consecutive var1 values: 1, 2, 3, ...
df$var2 <- with(rle(as.character(df$var1)), rep(seq_along(values), lengths))
results in column var 2
var1 var2 Should be
a 1 1
a 1 1
1 2 2
0 3 3
b 4 4
b 4 4
b 4 4
c 5 1
1 6 2
1 6 2
In data.table you can use rleid to get a run-length-id for var1 within each group.
library(data.table)
setDT(df)
df[, var2 := rleid(var1), by = cumsum(var1 == "c")]
df
# var1 var2
# 1: a 1
# 2: a 1
# 3: 1 2
# 4: 0 3
# 5: b 4
# 6: b 4
# 7: b 4
# 8: c 1
# 9: 1 2
#10: 1 2
and using dplyr
library(dplyr)
df %>%
group_by(group = cumsum(var1 == "c")) %>%
mutate(var2 = cumsum(var1 != lag(var1, default = first(var1))) + 1)
data
df <- structure(list(var1 = structure(c(3L, 3L, 2L, 1L, 4L, 4L, 4L,
5L, 2L, 2L), .Label = c("0", "1", "a", "b", "c"), class = "factor")),
class = "data.frame", row.names = c(NA, -10L))
We can use the OP's code with rle in base R with ave
df$var2 <- with(df, as.integer(ave(as.character(var1), cumsum(var1 == 'c'),
FUN = function(x) with(rle(x), rep(seq_along(values), lengths)))))
df$var2
#[1] 1 1 2 3 4 4 4 1 2 2
data
df <- structure(list(var1 = structure(c(3L, 3L, 2L, 1L, 4L, 4L, 4L,
5L, 2L, 2L), .Label = c("0", "1", "a", "b", "c"), class = "factor")),
class = "data.frame", row.names = c(NA,
-10L))

R Data Frame remove rows with max values from all columns

Hello, I have a data frame and I need to remove every row that contains the maximum value of any column.
Example
A B C
1 2 3 5
2 4 1 1
3 1 4 3
4 2 1 1
So the output is:
A B C
4 2 1 1
Is there any quick way to do this?
We can do this with %in%
df1[!seq_len(nrow(df1)) %in% sapply(df1, which.max),]
# A B C
#4 2 1 1
If there are ties for the maximum value within a column, then do
df1[!Reduce(`|`, lapply(df1, function(x) x== max(x))),]
df[-sapply(df, which.max),]
# A B C
#4 2 1 1
DATA
# Example data: four rows, three integer columns.
df <- data.frame(
  A = c(2L, 4L, 1L, 2L),
  B = c(3L, 1L, 4L, 1L),
  C = c(5L, 1L, 3L, 1L)
)

Replace 0 when first observation for a level factor R

I have this sample:
data <- structure(list(mmsi = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L), .Label = c("a", "b"), class = "factor"),
tr = c(1, 1, 1, 0, 2, 2, 0, 4, 4, 0, 5, 5)), .Names = c("mmsi",
"tr"), row.names = c(NA, -12L), class = "data.frame")
I want to replace each 0 in the column tr with the previous value of tr, for each mmsi.
This function works well on the sample:
for ( i in levels(data$mmsi) ) {
data$test <- na.locf(with(data, { is.na(tr) <- tr == 0; tr }), fromLast = FALSE)}
But when I play with a bigger sample, one issue appears: if the first value is 0, then I get an error (because it cannot find the previous value...).
For example if I edit the small sample with
data <- structure(list(mmsi = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L), .Label = c("a", "b"), class = "factor"),
tr = c(0, 1, 1, 0, 2, 2, 0, 4, 4, 0, 5, 5)), .Names = c("mmsi",
"tr"), row.names = c(NA, -12L), class = "data.frame")
The column tr begins here with 0 instead of 1 in the previous sample. If I apply the same function for ( i in levels(data$mmsi) ) {
data$test <- na.locf(with(data, { is.na(tr) <- tr == 0; tr }), fromLast = FALSE)} then I have of course the error
Error in `$<-.data.frame`(`*tmp*`, "test", value = c(1, 1, 1, 2, 2, 2, :
replacement has 11 rows, data has 12
--> the function could not replace the value I changed (the first value in the column tr)
I guess my function needs one more step that first handles a 0 occurring as the first value of tr for a given mmsi. That step should replace the 0 with the following non-zero value; the rest of the function is fine.
The output I am looking for this new column is:
data$test
[1] 1 1 1 1 2 2 2 4 4 4 5 5
Any idea how to get this?
We can do this with one of the group by functions. Convert the 'data.frame' to 'data.table' (setDT(data)), grouped by 'mmsi', apply the na.locf (from zoo) after replacing the '0' values to 'NA' and with the option na.rm = FALSE, then we do a second na.locf with fromLast = TRUE to replace the starting 0 (aka NA) to the next value.
library(data.table)
library(zoo)
setDT(data)[, test := na.locf(na.locf(replace(tr, tr==0, NA),
na.rm=FALSE), fromLast=TRUE), by = mmsi]
data
# mmsi tr test
# 1: a 0 1
# 2: a 1 1
# 3: a 1 1
# 4: a 0 1
# 5: a 2 2
# 6: a 2 2
# 7: a 0 2
# 8: b 4 4
# 9: b 4 4
#10: b 0 4
#11: b 5 5
#12: b 5 5
We could also do this without using the na.locf
# NA^!tr is NA where tr == 0 and 1 elsewhere, so (NA^!tr) * tr turns the zeros into NA;
# shift() then lags that vector by one row within each mmsi group.
# The inner pmax(..., na.rm = TRUE) takes the larger of the current value and the
# previous row's non-zero value, which fills a 0 that directly follows a non-zero value.
# The outer pmax(..., 1) raises anything still below 1 (e.g. a leading 0) to the constant 1.
# NOTE(review): this looks back only one row and hard-codes 1 as the fill for a
# leading 0, which matches this sample — confirm before reusing on other data.
setDT(data)[, test := pmax(pmax(tr, shift((NA^!tr) * tr), na.rm = TRUE),1), mmsi]

Finding the max number of occurrences from the available result

I have a dataframe which looks like -
Id Result
A 1
B 2
C 1
B 1
C 1
A 2
B 1
B 2
C 1
A 1
B 2
Now I need to calculate how many 1's and 2's are there for each Id and then select the number whose frequency of occurrence is the greatest.
Id Result
A 1
B 2
C 1
How can I do that? I have tried using the table function in some way but not able to use it effectively. Any help would be appreciated.
Here you can use aggregate in one step:
df <- structure(list(Id = structure(c(1L, 2L, 3L, 2L, 3L, 1L, 2L, 2L,
3L, 1L, 2L), .Label = c("A", "B", "C"), class = "factor"),
Result = c(1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L)),
.Names = c("Id", "Result"), class = "data.frame", row.names = c(NA, -11L)
)
# Return the most frequent value of x (the smallest one on ties), keeping x's type.
# Works for any set of values, not just 1 and 2.
most_frequent <- function(x) {
  values <- sort(unique(x))
  values[which.max(tabulate(match(x, values)))]
}
# One row per Id holding that Id's most frequent Result.
res <- aggregate(Result ~ Id, df, FUN = most_frequent)
res
Result:
Id Result
1 A 1
2 B 2
3 C 1
With data.table you can try (df is your data.frame):
library(data.table)
dt <- as.data.table(df)
# Count rows per (Id, Result) pair, then keep the Result with the highest
# count for each Id (which.max() takes the first one on ties).
dt[, .(times = .N), by = .(Id, Result)][
  , .(Result = Result[which.max(times)]), by = Id]
# Id Result
#1: A 1
#2: B 2
#3: C 1
Using dplyr, you can try
library(dplyr)
# Count each (Id, Result) pair, keep the pair(s) with the highest count per Id,
# then drop the grouping and the helper count column.
df %>%
  count(Id, Result) %>%
  group_by(Id) %>%
  filter(n == max(n)) %>%
  ungroup() %>%
  select(Id, Result)
Id Result
1 A 1
2 B 2
3 C 1
An option using table and ave
# Cross-tabulate the data, turn the table into long form (one row per cell with
# its Freq), then keep the rows whose Freq equals the maximum Freq for that Id;
# select = -3 drops the third column of the long-form table.
# Rows tied for the maximum within an Id are all kept.
# NOTE(review): df1 is assumed to be the two-column Id/Result data frame from the question — confirm.
subset(as.data.frame(table(df1)),ave(Freq, Id, FUN=max)==Freq, select=-3)
# Id Result
# 1 A 1
# 3 C 1
# 5 B 2

Resources