Suppose that we have the following data frame:
ID <- c(1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 5)
age <- c(25, 25, 25, 22, 22, 56, 56, 56, 80, 33, 33, 90, 90, 90)
gender <- c("m", "m", "m", "f", "f", "m", "m", "m", "m", "m", "m", "f", "f", "m")
company <- c("c1", "c2", "c2", "c3", "c3", "c1", "c1", "c1", "c1", "c5", "c5", "c3", "c4", "c5")
income <- c(1000, 1000, 1000, 500, 1700, 200, 200, 250, 500, 700, 700, 300, 350, 300)
df <- data.frame(ID, age, gender, company, income)
I need to find the rows that have different values within the same ID in any of age, gender, and income. I don't care whether the company values are the same or different.
So after processing, here is the output:
BONUS,
Can we create another data frame that lists the variables that differ within each ID? For example:
An option would be to group by 'ID', check whether the number of distinct elements in 'age', 'gender', 'income' is equal to 1 and then negate (!)
library(dplyr)
# Keep all rows of every ID for which it is NOT the case that age, gender and
# income each have exactly one distinct value within that ID.
out <- df %>%
  group_by(ID) %>%
  filter(
    !all(c(n_distinct(age), n_distinct(gender), n_distinct(income)) == 1)
  )
out
# A tibble: 9 x 5
# Groups: ID [3]
# ID age gender company income
# <dbl> <dbl> <fct> <fct> <dbl>
#1 2 22 f c3 500
#2 2 22 f c3 1700
#3 3 56 m c1 200
#4 3 56 m c1 200
#5 3 56 m c1 250
#6 3 80 m c1 500
#7 5 90 f c3 300
#8 5 90 f c4 350
#9 5 90 m c5 300
If there are many variables, another option is filter_at
# Same filter, written once for a whole set of columns: keep the rows of every
# ID in which at least one of the listed columns does not have exactly one
# distinct value. filter_at()/any_vars() still work but are superseded; this is
# the if_any() form of the same filter (requires dplyr >= 1.0.4).
df %>%
  group_by(ID) %>%
  filter(if_any(c(age, gender, income), ~ n_distinct(.x) != 1))
From the above, we can get the second output with
library(tidyr)
# List, per ID, which of the compared variables take more than one value.
out %>%
  # drop the grouping carried by `out`; the groups are set explicitly below
  ungroup() %>%
  select(-company) %>%
  # long format: one row per (ID, variable, value). The columns have different
  # types, so the values are converted to character to share one column.
  pivot_longer(-ID, names_to = "key", values_to = "val",
               values_transform = list(val = as.character)) %>%
  group_by(ID, key) %>%
  # keep only the variables that vary within the ID
  filter(n_distinct(val) > 1) %>%
  group_by(ID) %>%
  summarise(Different = toString(unique(key)))
# A tibble: 3 x 2
# ID Different
# <dbl> <chr>
#1 2 income
#2 3 age, income
#3 5 gender, income
In base R, we can split the c("age", "gender", "income") columns by ID, find the IDs that have more than one unique row, and subset them.
# Flag, per ID, whether its (age, gender, income) rows are not all identical,
# then keep the rows of the flagged IDs. The flags are matched back to the IDs
# through the names that split() attaches rather than by position: split()
# orders its groups by sorted ID, which need not be the order of unique(df$ID).
differs <- vapply(
  split(df[c("age", "gender", "income")], df$ID),
  function(x) nrow(unique(x)) > 1,
  logical(1)
)
df[as.character(df$ID) %in% names(differs)[differs], ]
# ID age gender company income
#4 2 22 f c3 500
#5 2 22 f c3 1700
#6 3 56 m c1 200
#7 3 56 m c1 200
#8 3 56 m c1 250
#9 3 80 m c1 500
#12 5 90 f c3 300
#13 5 90 f c4 350
#14 5 90 m c5 300
Related
I have the following data:
names <- c("a", "b", "c", "d")
scores <- c(95, 55, 100, 60)
df <- cbind.data.frame(names, scores)
I want to "extend" this data frame to make name pairs for every possible combination of names without repetition like so:
names_1 <- c("a", "a", "a", "b", "b", "c")
names_2 <- c("b", "c", "d", "c", "d", "d")
scores_1 <- c(95, 95, 95, 55, 55, 100)
scores_2 <- c(55, 100, 60, 100, 60, 60)
df_extended <- cbind.data.frame(names_1, names_2, scores_1, scores_2)
In the extended data, scores_1 are the scores for the corresponding name in names_1, and scores_2 are for names_2.
The following bit of code makes the appropriate name pairs. But I do not know how to get the scores in the right place after that.
t(combn(df$names,2))
The final goal is to get the row-wise difference between scores_1 and scores_2.
df_extended$score_diff <- abs(df_extended$scores_1 - df_extended$scores_2)
# For each pair of names produced by combn(), the callback returns the two
# names followed by the scores of the rows whose name is in the pair; t() then
# gives one row per pair.
df_ext <- data.frame(t(combn(df$names, 2,\(x)c(x, df$scores[df$names %in%x]))))
# c() put names and scores into one vector, so type.convert() is used to turn
# the score columns back into numbers before the columns are renamed.
df_ext <- setNames(type.convert(df_ext, as.is =TRUE), c('name_1','name_2', 'type_1', 'type_2'))
df_ext
name_1 name_2 type_1 type_2
1 a b 95 55
2 a c 95 100
3 a d 95 60
4 b c 55 100
5 b d 55 60
6 c d 100 60
names <- c("a", "b", "c", "d")
scores <- c(95, 55, 100, 60)
df <- cbind.data.frame(names, scores)
library(tidyverse)
# Build all 2-combinations of each column (names and scores), bind them side by
# side as <column>_1 / <column>_2, and add the absolute row-wise score
# difference that the question asks for.
map(df, ~combn(x = .x, m = 2)%>% t %>% as_tibble) %>%
  imap_dfc(~set_names(x = .x, nm = paste(.y, seq(ncol(.x)), sep = "_"))) %>%
  mutate(score_diff = abs(scores_1 - scores_2))
#> # A tibble: 6 × 5
#> names_1 names_2 scores_1 scores_2 score_diff
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 a b 95 55 40
#> 2 a c 95 100 5
#> 3 a d 95 60 35
#> 4 b c 55 100 45
#> 5 b d 55 60 5
#> 6 c d 100 60 40
Created on 2022-06-06 by the reprex package (v2.0.1)
First, we can create a new data frame with the unique combinations of names. Then, we can merge on the scores to match the names for both names_1 and names_2 to get the final data.frame.
# Sample data: one score per name.
names <- c("a", "b", "c", "d")
scores <- c(95, 55, 100, 60)
df <- cbind.data.frame(names, scores)
# One row per unordered pair of names.
new_df <- setNames(
  data.frame(t(combn(df$names, 2))),
  c("names_1", "names_2")
)
# Look up the score of each member of the pair with one merge per side.
new_df <- merge(new_df, df, by.x = "names_1", by.y = "names")
new_df <- merge(new_df, df, by.x = "names_2", by.y = "names")
# After the second merge, columns 3 and 4 hold the scores for names_1 and
# names_2 respectively.
names(new_df)[3:4] <- c("scores_1", "scores_2")
> new_df
names_2 names_1 scores_1 scores_2
1 b a 95 55
2 c a 95 100
3 c b 55 100
4 d a 95 60
5 d b 55 60
6 d c 100 60
I am stuck in performing pivot_longer() over multiple sets of columns. Here is the sample dataset
df <- data.frame(
id = c(1, 2),
uid = c("m1", "m2"),
germ_kg = c(23, 24),
mineral_kg = c(12, 17),
perc_germ = c(45, 34),
perc_mineral = c(78, 10))
I need the output dataframe to look like this
# Expected result: one row per id/crop pair, carrying that crop's kg and perc
# values. It is assigned to `out` only, so the sample input `df` stays intact,
# and `crop` alternates germ/mineral so each label lines up with its values
# (id 1: germ 23 kg / 45 %, mineral 12 kg / 78 %).
out <- data.frame(
  id = c(1, 1, 2, 2),
  uid = c("m1", "m1", "m2", "m2"),
  crop = c("germ", "mineral", "germ", "mineral"),
  kg = c(23, 12, 24, 17),
  perc = c(45, 78, 34, 10))
# Move the unit to the front of the *_kg names (germ_kg -> kg_germ) so that
# every measure column reads <measure>_<crop>, like perc_germ / perc_mineral.
df %>%
rename_with(~str_replace(.x,'(.*)_kg', 'kg_\\1')) %>%
# Split each name at '_': the first piece (kg / perc) names the output value
# column via '.value', the second piece goes into the crop column.
pivot_longer(-c(id, uid), names_to = c('.value', 'crop'), names_sep = '_')
# A tibble: 4 x 5
id uid crop kg perc
<dbl> <chr> <chr> <dbl> <dbl>
1 1 m1 germ 23 45
2 1 m1 mineral 12 78
3 2 m2 germ 24 34
4 2 m2 mineral 17 10
If you were to use data.table:
library(data.table)
# Melt the columns matching 'kg' and the columns matching 'perc' into two value
# columns, keeping id and uid as identifiers. As the printed result shows,
# `variable` is a 1/2 index over the matched columns, not the crop name.
melt(setDT(df), c('id', 'uid'), patterns(kg = 'kg', perc = 'perc'))
id uid variable kg perc
1: 1 m1 1 23 45
2: 2 m2 1 24 34
3: 1 m1 2 12 78
4: 2 m2 2 17 10
I suspect there might be a simpler way using pivot_longer_spec, but one tricky thing here is that your column names don't have a consistent ordering of their semantic components. @Onyambu's answer deals with this nicely by fixing it upstream.
library(tidyverse)
df %>%
pivot_longer(-c(id, uid)) %>%
# The separate/mutate/select steps are only needed because the name structure
# is inconsistent (<crop>_kg vs. perc_<crop>): split each name into its two
# parts, then work out which part is the crop and which is the measure.
separate(name, c("col1", "col2")) %>%
mutate(crop = if_else(col2 == "kg", col1, col2),
meas = if_else(col2 == "kg", col2, col1)) %>%
select(id, uid, crop, meas, value) %>%
# Spread the measures back out: one column per value of meas.
pivot_wider(names_from = meas, values_from = value)
# A tibble: 4 x 5
id uid crop kg perc
<dbl> <chr> <chr> <dbl> <dbl>
1 1 m1 germ 23 45
2 1 m1 mineral 12 78
3 2 m2 germ 24 34
4 2 m2 mineral 17 10
I am trying to add two rows to the data frame.
Regarding the first row, its value in the MODEL column should be X, total_value should be the sum of total_value over the rows where MODEL is A or C, and total_frequency should be the sum of total_frequency over the rows where MODEL is A or C.
In the second row, the value in the MODEL column should be Z, total_value should be the sum of total_value over the rows where MODEL is D, F or E, and total_frequency should be the sum of total_frequency over the rows where MODEL is D, F or E.
I am stuck, as I do not know how to select specific values of MODEL and then sum these two other columns.
Here is my data
# Sample data. It is assigned to `df` because the dplyr code that follows reads
# it under that name; left unassigned, the data frame would only be printed.
df <- data.frame(
  MODEL = c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J"),
  total_value = c(62, 54, 78, 38, 16, 75, 39, 13, 58, 37),
  total_frequency = c(78, 83, 24, 13, 22, 52, 16, 16, 20, 72))
You can try with dplyr, calculating the "new rows", then put together with the data df:
library(dplyr)
first <- df %>%
  # select the models you need
  filter(MODEL %in% c("A", "C")) %>%
  # relabel them all as X so they fall into a single group
  mutate(MODEL = "X") %>%
  # grouping
  group_by(MODEL) %>%
  # sum every non-grouping column; summarise(across()) replaces the
  # superseded summarise_all()
  summarise(across(everything(), sum))
# same with the second
second <- df %>%
  filter(MODEL %in% c("D", "F", "E")) %>%
  mutate(MODEL = "Z") %>%
  group_by(MODEL) %>%
  summarise(across(everything(), sum))
# put together
rbind(df, first, second)
# A tibble: 12 x 3
MODEL total_value total_frequency
1 A 62 78
2 B 54 83
3 C 78 24
4 D 38 13
5 E 16 22
6 F 75 52
7 G 39 16
8 H 13 16
9 I 58 20
10 J 37 72
11 X 140 102
12 Z 129 87
The following code is a straightforward solution to the problem.
# NOTE(review): df1 is not defined in the visible text; presumably it holds the
# question's data frame (MODEL, total_value, total_frequency) - confirm.
# Row mask for the models that make up X, then their column totals.
i1 <- df1$MODEL %in% c("A", "C")
total_value <- sum(df1$total_value[i1])
total_frequency <- sum(df1$total_frequency[i1])
# Append the X row; the unnamed arguments become columns named after the
# variables passed in.
df1 <- rbind(df1, data.frame(MODEL = "X", total_value, total_frequency))
# Same steps for Z. The mask is computed after the X row was appended, which
# does not matter here because "X" is not one of the selected models.
i2 <- df1$MODEL %in% c("D", "E", "F")
total_value <- sum(df1$total_value[i2])
total_frequency <- sum(df1$total_frequency[i2])
df1 <- rbind(df1, data.frame(MODEL = "Z", total_value, total_frequency))
df1
# MODEL total_value total_frequency
#1 A 62 78
#2 B 54 83
#3 C 78 24
#4 D 38 13
#5 E 16 22
#6 F 75 52
#7 G 39 16
#8 H 13 16
#9 I 58 20
#10 J 37 72
#11 X 140 102
#12 Z 129 87
It is also possible to write a function to avoid repeating the same code.
# Append one summary row to X.
#   X    data frame with columns MODEL, total_value and total_frequency
#   M    label stored in MODEL for the new row
#   vals MODEL values whose rows are summed
# Returns X with the extra row bound at the bottom.
fun <- function(X, M, vals){
  picked <- is.element(X$MODEL, vals)
  summary_row <- data.frame(
    MODEL = M,
    total_value = sum(X$total_value[picked]),
    total_frequency = sum(X$total_frequency[picked])
  )
  rbind(X, summary_row)
}
df1 <- fun(df1, M = "X", vals = c("A", "C"))
df1 <- fun(df1, M = "Z", vals = c("D", "E", "F"))
This question already has an answer here:
Forward and backward fill data frame in R [duplicate]
(1 answer)
Closed 3 years ago.
Suppose that we have the following data frame:
ID <- c(1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6)
age <- c(25, 25, 25, 22, 22, 56, 56, 56, 80, 33, 33, 90, 90, 90, 5, 5, 5)
gender <- c("m", "m", NA, "f", "f", "m", NA, "m", "m", "m", NA, NA, NA, "m", NA, NA, NA)
company <- c("c1", "c2", "c2", "c3", "c3", "c1", "c1", "c1", "c1", "c5", "c5", "c3", "c4", "c5", "c3", "c1", "c1")
income <- c(1000, 1000, 1000, 500, 1700, 200, 200, 250, 500, 700, 700, 300, 350, 300, 500, 1700, 200)
df <- data.frame(ID, age, gender, company, income)
In this data we have 6 unique IDs, and if you look at the gender variable, it sometimes includes NA.
I want to replace the NAs with the correct gender category. Also, in case an ID has all NA's for gender, then leave it as is.
The expected outcome would be:
Here's a way in base R using ave -
# Within each ID, overwrite gender with that ID's first non-missing value; when
# every value is NA, na.omit(x)[1] is itself NA, so such IDs stay NA.
df$gender <- with(df, ave(gender, ID, FUN = function(x) na.omit(x)[1]))
ID age gender company income
1 1 25 m c1 1000
2 1 25 m c2 1000
3 1 25 m c2 1000
4 2 22 f c3 500
5 2 22 f c3 1700
6 3 56 m c1 200
7 3 56 m c1 200
8 3 56 m c1 250
9 3 80 m c1 500
10 4 33 m c5 700
11 4 33 m c5 700
12 5 90 m c3 300
13 5 90 m c4 350
14 5 90 m c5 300
15 6 5 <NA> c3 500
16 6 5 <NA> c1 1700
17 6 5 <NA> c1 200
Some ways with dplyr and tidyr -
# Option 1: give every row of an ID that ID's first non-missing gender
# (NA when the ID has none).
df %>%
  group_by(ID) %>%
  mutate(gender = na.omit(gender)[1])
# Option 2: within each ID, fill missing genders upwards and then downwards;
# .direction = "updown" does both passes in a single fill() call.
df %>%
  group_by(ID) %>%
  fill(gender, .direction = "updown")
Using the tidyverse library you can do this
library(tidyverse)
# for each ID get the gender
# for each ID get the gender
df_gender_ref <- df %>%
  filter(!is.na(gender)) %>%
  select(ID, gender) %>%
  unique()
# add the new gender column to the original dataframe; the key is named
# explicitly so the join does not have to guess it from the shared columns
df %>%
  select(-gender) %>%
  left_join(df_gender_ref, by = "ID")
I have a data frame, and I'd like to create a new column that gives the sum of a numeric variable grouped by factors. So something like this:
BEFORE:
data1 <- data.frame(month = c(1, 1, 2, 2, 3, 3),
sex = c("m", "f", "m", "f", "m", "f"),
value = c(10, 20, 30, 40, 50, 60))
AFTER:
data2 <- data.frame(month = c(1, 1, 2, 2, 3, 3),
sex = c("m", "f", "m", "f", "m", "f"),
value = c(10, 20, 30, 40, 50, 60),
sum = c(30, 30, 70, 70, 110, 110))
In Stata you can do this with the egen command quite easily. I've tried the aggregate function, and the ddply function but they create entirely new data frames, and I just want to add a column to the existing one.
You are looking for ave
> data2 <- transform(data1, sum=ave(value, month, FUN=sum))
month sex value sum
1 1 m 10 30
2 1 f 20 30
3 2 m 30 70
4 2 f 40 70
5 3 m 50 110
6 3 f 60 110
data1$sum <- ave(data1$value, data1$month, FUN=sum) is useful if you don't want to use transform
Also data.table is helpful
library(data.table)
DT <- data.table(data1)
DT[, sum:=sum(value), by=month]
UPDATE
We can also use a tidyverse approach which is simple, yet elegant:
> library(tidyverse)
> data1 %>%
group_by(month) %>%
mutate(sum=sum(value))
# A tibble: 6 x 4
# Groups: month [3]
month sex value sum
<dbl> <fct> <dbl> <dbl>
1 1 m 10 30
2 1 f 20 30
3 2 m 30 70
4 2 f 40 70
5 3 m 50 110
6 3 f 60 110