I am converting old base R code into tidyverse and could use some help. I want to reverse code some vars in df1 conditional on the variable being tagged as positive==1 in a lookup table df2. Here's my base R solution:
library(tidyverse)
set.seed(1)
df1 <- data.frame(item1 = sample(1:4, 10, replace = TRUE),
item2 = sample(1:4, 10, replace = TRUE),
item3 = sample(1:4, 10, replace = TRUE))
df1
# item1 item2 item3
# 1 2 1 4
# 2 2 1 1
# 3 3 3 3
# 4 4 2 1
# 5 1 4 2
# 6 4 2 2
# 7 4 3 1
# 8 3 4 2
# 9 3 2 4
# 10 1 4 2
df2 <- data.frame(name = c("item1", "item2"),
positive = c(1, 0))
# name positive
# 1 item1 1
# 2 item2 0
vars <- c("item1", "item2")
# reverse code if positive==1
# 4=1, 3=2, 2=3, 1=4
for (item in vars) {
  # Look up this variable's flag in the df2 lookup table.
  flagged <- df2$positive[df2$name == item] == 1
  if (!flagged) {
    next # not tagged positive == 1: leave the column untouched
  }
  df1[item] <- 4 - df1[, item] + 1 # should reverse code item1
}
df1
# item1 item2 item3
# 1 3 1 4
# 2 3 1 1
# 3 2 3 3
# 4 1 2 1
# 5 4 4 2
# 6 1 2 2
# 7 1 3 1
# 8 2 4 2
# 9 2 2 4
# 10 4 4 2
We can use mutate() with across(), where any_of() selects the columns whose names come from subsetting the 'name' column of df2 by the binary values of 'positive' converted to logical, and then subtract each selected column from 4 and add 1 (i.e. 4 - x + 1)
library(dplyr)
# Names of the items flagged in the lookup table df2.
positive_items <- as.character(df2$name)[as.logical(df2$positive)]
# any_of() keeps only the flagged names that actually exist in df1, so names
# listed in df2 but missing from df1 are ignored instead of raising an error.
dfn <- df1 %>%
  mutate(across(any_of(positive_items), ~ 4 - .x + 1))
dfn
# item1 item2 item3
#1 3 1 4
#2 3 1 1
#3 2 3 3
#4 1 2 1
#5 4 4 2
#6 1 2 2
#7 1 3 1
#8 2 4 2
#9 2 2 4
#10 4 4 2
Or with base R
vars1 <- with(df2, as.character(name[as.logical(positive)]))
df1[vars1] <- lapply(df1[vars1], function(x) 4 - x + 1)
data
df1 <- structure(list(item1 = c(2L, 2L, 3L, 4L, 1L, 4L, 4L, 3L, 3L,
1L), item2 = c(1L, 1L, 3L, 2L, 4L, 2L, 3L, 4L, 2L, 4L), item3 = c(4L,
1L, 3L, 1L, 2L, 2L, 1L, 2L, 4L, 2L)), class = "data.frame",
row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"))
Related
ID
1
1
2
3
3
3
3
I want to use data.table to add a column that counts how many times each ID value (1, 2, 3, etc.) occurs. The final data.table would be:
ID CountID
1 2
1 2
2 1
3 4
3 4
3 4
3 4
I'm trying this, but it does not work:
df[, CountID := uniqueN(df, by = ID)]
Using dplyr package
# Count the rows for each id, then merge the counts back onto every row of df.
df1 <- count(df, id)
merge(df, df1)
id n
1 1 3
2 1 3
3 1 3
4 2 1
5 3 4
6 3 4
7 3 4
8 3 4
9 4 2
10 4 2
Data
df = data.frame('id' = c( 1 , 1 , 1, 2, 3, 3, 3, 3, 4, 4))
data.table
You can use .N for this:
library(data.table)
DT[, CountID := .N, by = ID]
DT
# ID CountID
# <int> <int>
# 1: 1 2
# 2: 1 2
# 3: 2 1
# 4: 3 4
# 5: 3 4
# 6: 3 4
# 7: 3 4
base R
DT$CountID2 <- ave(rep(1L, nrow(DT)), DT$ID, FUN = length)
Data
DT <- setDT(structure(list(ID = c(1L, 1L, 2L, 3L, 3L, 3L, 3L), CountID = c(2L, 2L, 1L, 4L, 4L, 4L, 4L)), class = c("data.table", "data.frame"), row.names = c(NA, -7L)))
I have a dataframe like this:
ID S1 C
1 1 2 3
2 1 2 3
3 3 1 1
4 6 2 5
5 6 7 5
What I need is the number of rows per group ID where S1 <= C. This is the desired output.
ID Obs
1 1 2
2 3 1
3 6 1
Even though the question was answered below, I have a follow up question: Is it possible to do the same for multiple columns (S1, S2, ..). For example for the dataframe below:
ID S1 S2 C
1 1 2 2 3
2 1 2 2 3
3 3 1 1 1
4 6 2 2 5
5 6 7 7 5
And then get:
ID S1.Obs S2.Obs
1 1 2 2
2 3 1 1
3 6 1 1
A base R solution with aggregate().
aggregate(Obs ~ ID, transform(df, Obs = S1 <= C), sum)
# ID Obs
# 1 1 2
# 2 3 1
# 3 6 1
A dplyr solution
library(dplyr)
df %>%
filter(S1 <= C) %>%
count(ID, name = "Obs")
# ID Obs
# 1 1 2
# 2 3 1
# 3 6 1
Data
df <- structure(list(ID = c(1L, 1L, 3L, 6L, 6L), S1 = c(2L, 2L, 1L, 2L, 7L),
C = c(3L, 3L, 1L, 5L, 5L)), class = "data.frame", row.names = c("1", "2", "3", "4", "5"))
Extension
If you want to apply this rule on multiple columns such as S1, S2, S3:
# For every column whose name starts with "S", count the rows per ID where the
# column value is <= C. `.names` appends ".Obs" so the result columns are
# named S1.Obs, S2.Obs, ... as in the requested output.
df %>%
  group_by(ID) %>%
  summarise(across(starts_with("S"), ~ sum(.x <= C), .names = "{.col}.Obs"))
data <- data.frame(
ID = c(1, 1, 3, 6, 6),
S1 = c(2, 2, 1, 2, 7),
C = c(3, 3, 1, 5, 5)
)
library(dplyr)
# Keep the rows where S1 <= C, then count how many remain for each ID.
keep_row <- data$S1 <= data$C
data.filtered <- data[keep_row, ]
data.filtered %>%
  group_by(ID) %>%
  summarize(Obs = n())
An option with data.table
library(data.table)
setDT(df)[S1 <=C, .(Obs = .N), ID]
# ID Obs
#1: 1 2
#2: 3 1
#3: 6 1
data
df <- structure(list(ID = c(1L, 1L, 3L, 6L, 6L), S1 = c(2L, 2L, 1L, 2L, 7L),
C = c(3L, 3L, 1L, 5L, 5L)), class = "data.frame", row.names = c("1", "2", "3", "4", "5"))
suppose I want to find duplicate rows for columns:
cols<-c("col1", "col2")
I know that for the data frame df4 the duplicate rows are:
Jo<-df4[duplicated(df4[cols]) | duplicated(df4[cols], fromLast = TRUE), ]
and the data set with these duplicate rows removed is given by:
No<-df4[!(duplicated(df4[cols]) | duplicated(df4[cols], fromLast = TRUE)), ]
I want to modify the above code. Suppose there is a column called mode, which takes integer values between 1 and 4. A set of duplicate rows should not be treated as duplicates when all of its rows have mode==2.
example
col1 col2 mode
1 3 5
5 3 9
1 2 1
1 2 1
3 2 2
3 2 2
4 1 3
4 1 2
4 1 2
output
Jo:
col1 col2 mode
1 2 1
1 2 1
4 1 3
4 1 2
4 1 2
No:
col1 col2 mode
1 3 5
5 3 9
3 2 2
3 2 2
In the above example, the two rows with col1 = 3 and col2 = 2 both have mode==2, so they are not treated as duplicates; the last three rows are treated as duplicates because one of them has a mode other than 2.
Based on the updated dataset,
library(dplyr)
# Keep the groups (defined by the columns named in `cols`) that have more than
# one row and whose rows are not all mode == 2.
out1 <- df2 %>%
  group_by(across(all_of(cols))) %>%
  filter(n() > 1, !all(mode == 2))
# Everything else: the rows of df2 whose `cols` group does not appear in out1.
out2 <- anti_join(df2, out1, by = cols)
out1
# A tibble: 5 x 3
# Groups: col1, col2 [2]
# col1 col2 mode
# <int> <int> <int>
#1 1 2 1
#2 1 2 1
#3 4 1 3
#4 4 1 2
#5 4 1 2
out2
# col1 col2 mode
#1 1 3 5
#2 5 3 9
#3 3 2 2
#4 3 2 2
Or with data.table
library(data.table)
i1 <- setDT(df2)[ , .I[.N > 1 & !all(mode == 2)], by = cols]$V1
df2[i1]
# col1 col2 mode
#1: 1 2 1
#2: 1 2 1
#3: 4 1 3
#4: 4 1 2
#5: 4 1 2
df2[!i1]
# col1 col2 mode
#1: 1 3 5
#2: 5 3 9
#3: 3 2 2
#4: 3 2 2
Or using base R
i1 <- duplicated(df2[1:2])|duplicated(df2[1:2], fromLast = TRUE)
out11 <- df2[i1 & with(df2, !ave(mode==2, col1, col2, FUN = all)),]
out22 <- df2[setdiff(row.names(df2), row.names(out11)),]
data
df2 <- structure(list(col1 = c(1L, 5L, 1L, 1L, 3L, 3L, 4L, 4L, 4L),
col2 = c(3L, 3L, 2L, 2L, 2L, 2L, 1L, 1L, 1L), mode = c(5L,
9L, 1L, 1L, 2L, 2L, 3L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-9L))
ID Number Var
1 2 6
1 2 7
1 1 8
1 2 9
1 2 10
2 2 3
2 2 4
2 1 5
2 2 6
Each person has several records.
There is only one record of a person whose Number is 1, the rest is 2.
The variable Var has different values for the same person.
When the Number equals to 1, the corresponding Var (we call it P) is different for different persons.
Now, I want to delete the rows whose Var > P for every person.
At the end, I want this
ID Number Var
1 2 6
1 2 7
1 1 8
2 2 3
2 2 4
2 1 5
You can use dplyr::first on the rows where Number == 1 to get the first matching Var value
library(dplyr)
df %>% group_by(ID) %>% mutate(Flag=first(Var[Number==1])) %>%
filter(Var <= Flag) %>% select(-Flag)
# short version, if you are sure there is exactly one Number == 1 per ID
df %>% group_by(ID) %>% filter(Var <= Var[Number==1])
Here is a solution with data.table:
library(data.table)
dt <- fread(
"ID Number Var
1 2 6
1 2 7
1 1 8
1 2 9
1 2 10
2 2 3
2 2 4
2 1 5
2 2 6")
dt[, .SD[Var <= Var[Number==1]], ID]
# ID Number Var
# 1: 1 2 6
# 2: 1 2 7
# 3: 1 1 8
# 4: 2 2 3
# 5: 2 2 4
# 6: 2 1 5
A base R option would be
# Per ID, take the Var of the row where Number == 1 as the reference value and
# keep the rows whose Var does not exceed it. Non-reference rows are masked
# with NA (rather than multiplied to 0), so a reference Var of 0 still works.
df1[with(df1, Var <= ave(replace(Var, Number != 1, NA), ID,
                         FUN = function(x) x[!is.na(x)][1])), ]
# ID Number Var
#1 1 2 6
#2 1 2 7
#3 1 1 8
#6 2 2 3
#7 2 2 4
#8 2 1 5
data
df1 <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), Number = c(2L,
2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L), Var = c(6L, 7L, 8L, 9L, 10L,
3L, 4L, 5L, 6L)), row.names = c(NA, -9L), class = "data.frame")
I searched but haven't seen where this has been handled. I have a data frame of pairwise absolute differences between sites from a project, and the data looks like this:
x y value
1 2 1 5
2 3 1 4
3 4 1 6
4 5 1 3
5 3 2 5
6 4 2 7
7 5 2 3
8 4 3 2
9 5 3 5
10 5 4 7
where x and y are paired sites and value is the difference. I would like to get the mean for each site displayed separately, e.g. the mean over all site 5 pairs (5|3, 5|4, 5|1, 5|2) = 4.5, so that my results look like this:
site avg
1 4.5
2 5
3 4
4 5.5
5 4.5
Who's got a solution?
Here is another option with tidyverse
library(tidyverse)
# Every site id that occurs in either column, in ascending order.
sites <- sort(unique(c(df$x, df$y)))
tibble(
  site = sites,
  # For each site, average `value` over the pairs in which it appears as x or y.
  avg = map_dbl(sites, function(s) {
    df %>%
      filter(x == s | y == s) %>%
      summarise(value = mean(value)) %>%
      pull(value)
  })
)
# A tibble: 5 x 2
# site avg
# <int> <dbl>
#1 1 4.5
#2 2 5
#3 3 4
#4 4 5.5
#5 5 4.5
data
df <- structure(list(x = c(2L, 3L, 4L, 5L, 3L, 4L, 5L, 4L, 5L, 5L),
y = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 4L), value = c(5L,
4L, 6L, 3L, 5L, 7L, 3L, 2L, 5L, 7L)), .Names = c("x", "y",
"value"), class = "data.frame",
row.names = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10"))
A solution using dplyr and mapply.
library(dplyr)
data.frame(site = unique(c(df$x, df$y))) %>%
mutate(mean = mapply(function(v)mean(df$value[df$x==v | df$y==v]), .$site)) %>%
arrange(site)
# site mean
# 1 1 4.5
# 2 2 5.0
# 3 3 4.0
# 4 4 5.5
# 5 5 4.5
Data:
df <- read.table(text =
" x y value
1 2 1 5
2 3 1 4
3 4 1 6
4 5 1 3
5 3 2 5
6 4 2 7
7 5 2 3
8 4 3 2
9 5 3 5
10 5 4 7",
header = TRUE, stringsAsFactors = FALSE)
If we name your original data example as df:
df$site_pair <- paste(df$x, df$y, sep = "-")
all_sites <- unique(c(df$x, df$y))
# Mean of `value` over every pair that involves the given site.
# Reads the data frame `df` (columns x, y, value) from the enclosing
# environment.
site_get_mean <- function(site_name) {
  # Compare against x and y directly: pattern-matching the pasted "x-y" label
  # would also hit longer ids that merely contain the digits (1 in "10-2").
  involved <- df$x == site_name | df$y == site_name
  mean(df$value[involved])
}
df.new <- data.frame(site = all_sites,
avg = sapply(all_sites, site_get_mean))
Result: (edited to order by site name)
> df.new[order(df.new$site), ]
site avg
5 1 4.5
1 2 5.0
2 3 4.0
3 4 5.5
4 5 4.5