# Input: one row per observed (TEACHER, STUDENT, TRIMESTER) combination.
HAVE <- data.frame(
  TEACHER   = c(1, 1, 1, 1, 1, 2, 2, 2, 2),
  STUDENT   = c(1, 1, 1, 2, 2, 3, 3, 3, 4),
  TRIMESTER = c(1, 2, 3, 2, 3, 3, 4, 5, 4)
)

# Desired result: one row per TRIMESTER with new and cumulative counts.
WANT <- data.frame(
  TRIMESTER     = c(1, 2, 3, 4, 5),
  NEWSTUDENTS   = c(1, 1, 1, 1, 0),
  TOTALSTUDENTS = c(1, 2, 3, 4, 4),
  NEWTEACHER    = c(1, 0, 1, 0, 0),
  TOTALTEACHER  = c(1, 1, 2, 2, 2)
)
I wish to convert HAVE to WANT by counting NEWSTUDENTS and TOTALSTUDENTS per TRIMESTER: a STUDENT counts as new in the TRIMESTER where it first appears, and TOTALSTUDENTS is the running total of new students. I wish to do the same for TEACHER.
We may loop across the columns, create 'NEW' columns using duplicated and then do a group by sum
library(dplyr)
library(stringr)
# Count first appearances of each TEACHER / STUDENT per TRIMESTER, then
# accumulate those counts into running totals.
HAVE %>%
  # sort by TRIMESTER so "first occurrence" means earliest TRIMESTER
  arrange(TRIMESTER) %>%
  # flag the first occurrence of every TEACHER and STUDENT value;
  # the flags land in NEWTEACHER / NEWSTUDENT
  mutate(across(
    c(TEACHER, STUDENT),
    function(x) !duplicated(x),
    .names = "NEW{.col}"
  )) %>%
  # number of first occurrences inside each TRIMESTER
  group_by(TRIMESTER) %>%
  summarise(across(starts_with("NEW"), sum), .groups = "drop") %>%
  # running totals of the NEW* columns, stored under matching TOTAL* names
  mutate(across(
    starts_with("NEW"),
    cumsum,
    .names = "{str_replace(.col, 'NEW', 'TOTAL')}"
  ))
-output
# A tibble: 5 × 5
TRIMESTER NEWTEACHER NEWSTUDENT TOTALTEACHER TOTALSTUDENT
<dbl> <int> <int> <int> <int>
1 1 1 1 1 1
2 2 0 1 1 2
3 3 1 1 2 3
4 4 0 1 2 4
5 5 0 0 2 4
This is an extension of this other answer of mine.
suppressPackageStartupMessages(library(dplyr))
# Input: one row per observed (TEACHER, STUDENT, TRIMESTER) combination.
HAVE <- data.frame(
  TEACHER   = c(1, 1, 1, 1, 1, 2, 2, 2, 2),
  STUDENT   = c(1, 1, 1, 2, 2, 3, 3, 3, 4),
  TRIMESTER = c(1, 2, 3, 2, 3, 3, 4, 5, 4)
)

# Target table the pipeline below is expected to reproduce.
WANT <- data.frame(
  TRIMESTER     = c(1, 2, 3, 4, 5),
  NEWSTUDENTS   = c(1, 1, 1, 1, 0),
  TOTALSTUDENTS = c(1, 2, 3, 4, 4),
  NEWTEACHER    = c(1, 0, 1, 0, 0),
  TOTALTEACHER  = c(1, 1, 2, 2, 2)
)
# Per-TRIMESTER counts of first-time students/teachers plus running totals.
HAVE %>%
  # duplicated() follows row order, so sort by TRIMESTER first; otherwise a
  # later TRIMESTER that happens to be listed earlier in the data would be
  # treated as the first appearance
  arrange(TRIMESTER) %>%
  # TRUE on the first row where each STUDENT / TEACHER value occurs
  mutate(NEWSTUDENTS = !duplicated(STUDENT),
         NEWTEACHER = !duplicated(TEACHER)) %>%
  group_by(TRIMESTER) %>%
  # number of first appearances within each TRIMESTER
  summarise(NEWSTUDENTS = sum(NEWSTUDENTS),
            NEWTEACHER = sum(NEWTEACHER)) %>%
  ungroup() %>%
  # running totals across TRIMESTERs
  mutate(TOTALSTUDENTS = cumsum(NEWSTUDENTS),
         TOTALTEACHER = cumsum(NEWTEACHER)) %>%
  # put the columns in the same order as WANT
  relocate(TOTALSTUDENTS, .before = NEWTEACHER)
#> # A tibble: 5 × 5
#> TRIMESTER NEWSTUDENTS TOTALSTUDENTS NEWTEACHER TOTALTEACHER
#> <dbl> <int> <int> <int> <int>
#> 1 1 1 1 1 1
#> 2 2 1 2 0 1
#> 3 3 1 3 1 2
#> 4 4 1 4 0 2
#> 5 5 0 4 0 2
Created on 2022-08-18 by the reprex package (v2.0.1)
Related
I have a dataset with financial data. Sometimes, a product gets refunded, resulting in a negative count of the product (so the money gets returned). I want to conditionally filter these rows out of the dataset.
Example:
library(tidyverse)
# Fix the RNG seed so the sampled counts are reproducible.
set.seed(1)

# 80 rows spread over 4 ids; count is -1 with probability 0.2 and 1 otherwise.
df <- tibble(
  count = sample(c(-1, 1), size = 80, replace = TRUE, prob = c(0.2, 0.8)),
  id = rep(1:4, times = 20)
)

# Net count per id.
df %>%
  group_by(id) %>%
  summarise(total = sum(count))
# A tibble: 4 x 2
id total
<int> <dbl>
1 1 10
2 2 14
3 3 16
4 4 10
id = 1 has 15 positive counts and 5 negatives. (15 - 5= 10). I want to keep 10 values in df with id = 1 with the positive values.
id = 2 has 17 positive counts and 3 negatives. (17- 3 = 14). I want to keep 14 values in df with id = 2 with the positive values.
In the end, this condition should be True nrow(df) == sum(df$count)
Unfortunately, a filtering join such as anti_join() will remove all the rows. For some reason I cannot think of another option to filter the tibble.
Thanks for helping me!
You can "uncount" using the total column to get the number of repeats of each row.
# Net count per id, expanded back into that many rows with count == 1.
df %>%
  # weighted tally: total = sum(count) for each id
  count(id, wt = count, name = "total") %>%
  # repeat each id `total` times (the total column itself is dropped)
  uncount(total) %>%
  mutate(count = 1)
#> # A tibble: 50 x 2
#> id count
#> <int> <dbl>
#> 1 1 1
#> 2 1 1
#> 3 1 1
#> 4 1 1
#> 5 1 1
#> 6 1 1
#> 7 1 1
#> 8 1 1
#> 9 1 1
#> 10 1 1
#> # ... with 40 more rows
Created on 2022-10-21 with reprex v2.0.2
I have conflict data that looks like this
conflict_ID country_code SideA
1 1 1
1 2 1
1 3 0
2 4 1
2 5 0
Now I want to make it into dyadic conflict data that looks like this (SideA=1 should be country_code_1):
conflict_ID country_code_1 country_code_2
1 1 3
1 2 3
2 4 5
Can anyone point me in the right direction?
Here's a direct approach:
# Pair every SideA == 1 country with the SideA == 0 countries of the same
# conflict.
left_join(
  # left side: SideA == 1 rows, country renamed to country_code_1
  df %>%
    filter(SideA == 1) %>%
    select(conflict_ID, country_code_1 = country_code),
  # right side: SideA == 0 rows, country renamed to country_code_2
  df %>%
    filter(SideA == 0) %>%
    select(conflict_ID, country_code_2 = country_code),
  by = "conflict_ID"
)
# conflict_ID country_code_1 country_code_2
# 1 1 1 3
# 2 1 2 3
# 3 2 4 5
Using this data:
# Example data: one row per (conflict_ID, country_code) with a 0/1 SideA flag.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned, which would silently change how the first line is parsed.
df <- read.table(text = 'conflict_ID country_code SideA
1 1 1
1 2 1
1 3 0
2 4 1
2 5 0 ', header = TRUE)
This extends the previous issue you posted. You could produce all combinations for each conflict_ID, and filter out those combinations where country_code_2 matches country_code with SideA == 1.
library(dplyr)
library(tidyr)
# Build every country pair per conflict, then discard the pairs whose second
# member is a SideA == 1 country.
mydf %>%
group_by(conflict_ID) %>%
# all 2-element combinations of the conflict's country codes, kept as a list
# column; `sort` is applied to each pair
summarise(country_code = combn(country_code, 2, sort, simplify = FALSE),
.groups = 'drop') %>%
# spread each pair into country_code_1 / country_code_2
unnest_wider(country_code, names_sep = '_') %>%
# remove pairs whose country_code_2 is a SideA == 1 country of the same conflict
anti_join(filter(mydf, SideA == 1),
by = c("conflict_ID", "country_code_2" = "country_code"))
# # A tibble: 3 × 3
# conflict_ID country_code_1 country_code_2
# <int> <int> <int>
# 1 1 1 3
# 2 1 2 3
# 3 2 4 5
When I run the below reproducible code I get the desired grouping results in the GroupRank column shown immediately beneath:
library(dplyr)
# Example data: an Element label and a Group id per row.
myData <-
data.frame(
Element = c("A","A","B","A","C","C"),
Group = c(0,0,0,0,1,1)
)
# Derive GroupRank for every row, then restore the original row order.
myDataGroups <- myData %>%
# remember the input row position so it can be restored at the end
mutate(origOrder = row_number()) %>%
group_by(Element) %>%
# running occurrence number of each Element
mutate(ElementCnt = row_number()) %>%
ungroup() %>%
# factor levels follow the order in which Group values first appear
mutate(Group = factor(Group, unique(Group))) %>%
arrange(Group) %>%
# number of Group changes seen so far, minus 1; lag() falls back to the first
# Group value, so the first row never counts as a change
mutate(groupCt = cumsum(Group != lag(Group, 1, Group[[1]])) - 1L) %>%
group_by(Group) %>%
# max() and min() below each yield a single value per group, not a row-wise result
mutate(GroupRank = ElementCnt - max(0L,groupCt),
# rows in Group "0" keep ElementCnt; all other rows get their group's smallest GroupRank
GroupRank = if_else(as.character(Group) == "0", ElementCnt, min(GroupRank))
)%>%
ungroup() %>%
arrange(origOrder)
# print the result
myDataGroups
> myDataGroups
# A tibble: 6 x 6
Element Group origOrder ElementCnt groupCt GroupRank
<chr> <fct> <int> <int> <int> <int>
1 A 0 1 1 -1 1
2 A 0 2 2 -1 2
3 B 0 3 1 -1 1
4 A 0 4 3 -1 3
5 C 1 5 1 0 1
6 C 1 6 2 0 1
However, when I take the line GroupRank = if_else(as.character(Group) == "0", ElementCnt, min(GroupRank)) from the above code and simply wrap it in a max function, like this: GroupRank = max(1L,if_else( as.character(Group) == "0", ElementCnt, min(GroupRank))) (I ran it with both 1 and 1L and got the same results), I get the strange output shown below. GroupRank shouldn't have changed from the above output:
Element Group origOrder ElementCnt groupCt GroupRank
<chr> <fct> <int> <int> <int> <int>
1 A 0 1 1 -1 3
2 A 0 2 2 -1 3
3 B 0 3 1 -1 3
4 A 0 4 3 -1 3
5 C 1 5 1 0 1
6 C 1 6 2 0 1
What am I doing wrong here? Am I using max() incorrectly?
Note the difference between max() and pmax().
# max() pools all of its arguments and returns one overall maximum
max(1:5, 5:1)
#> [1] 5
# pmax() compares the arguments element by element and returns a vector
pmax(1:5, 5:1)
#> [1] 5 4 3 4 5
max() returns a scalar, which is why you get a constant value per group. pmax() does what you apparently expect, which is return a rowwise maximum vector.
For the dataframe below I want to add the original values for Var_x after a group_by on ID and event and a max() on quest, but I cannot get my code right. Any suggestions? By the way, in my original dataframe more than 1 column needs to be added.
# Example data: the last row of ID 1 and of ID 3 has NA for event and VAR_X.
df <- data.frame(
  ID    = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3),
  quest = c(1, 1, 2, 2, 3, 3, 1, 2, 3, 1, 2, 3),
  event = c("A", "B", "A", "B", "A", NA, "C", "D", "C", "D", "D", NA),
  VAR_X = c(2, 4, 3, 6, 3, NA, 6, 4, 5, 7, 5, NA)
)
Code:
# Highest quest per (ID, event) pair; only the grouping columns and quest
# appear in the result.
df %>%
  group_by(ID, event) %>%
  summarise(quest = max(quest))
Desired output:
ID quest event VAR_X
1 1 2 B 6
2 1 3 A 3
3 2 2 D 4
4 2 3 C 5
5 3 2 D 5
Start by omitting the NA values and, at the end, do an inner_join with the original data set.
# Highest quest per (ID, event), with the remaining columns recovered by
# joining back to the original data.
df %>%
  # drop every row that contains an NA
  na.omit() %>%
  group_by(ID, event) %>%
  summarise(quest = max(quest)) %>%
  # matching on all three keys pulls in VAR_X for the retained rows
  inner_join(df, by = c("ID", "event", "quest"))
## A tibble: 5 x 4
## Groups: ID [3]
# ID event quest VAR_X
# <dbl> <fct> <dbl> <dbl>
#1 1 A 3 3
#2 1 B 2 6
#3 2 C 3 5
#4 2 D 2 4
#5 3 D 2 5
# Keep, for each (ID, event) pair, the rows holding that pair's highest quest.
df %>%
  # discards rows with missing values; omit this step if they should stay
  drop_na() %>%
  group_by(ID, event) %>%
  filter(quest == max(quest)) %>%
  ungroup()
# A tibble: 5 x 4
# ID quest event VAR_X
#<dbl> <dbl> <chr> <dbl>
# 1 1 2 B 6
# 2 1 3 A 3
# 3 2 2 D 4
# 4 2 3 C 5
# 5 3 2 D 5
My df looks something like this:
ID Obs Value
1 1 26
1 2 13
1 3 52
2 1 1,5
2 2 30
Using dplyr, I want to add the additional column Col, which is each value in the column Value divided by the group's first value in that column.
ID Obs Value Col
1 1 26 1
1 2 13 0,5
1 3 52 2
2 1 1,5 1
2 2 30 20
How do I do that?
After grouping by 'ID', use mutate to create a new column by dividing the 'Value' by the first of 'Value'
library(dplyr)
# Col = each Value divided by the first Value of its ID group.
df1 %>%
  group_by(ID) %>%
  mutate(Col = Value / first(Value))
If the first 'Value' is 0 and we don't want to use it, then subset the 'Value' with a logical expression and then take the first of that
# Same division, but the denominator is the first non-zero Value of the group.
df1 %>%
  group_by(ID) %>%
  mutate(Col = Value / first(Value[Value != 0]))
Or in base R
# Col = each Value divided by the first Value of its ID group (base R).
# Everything ave() receives through `...` is treated as a grouping variable,
# so extra arguments cannot be forwarded to FUN; the "first element" logic
# therefore lives inside FUN itself. The single value returned per group is
# recycled over that group's rows.
df1$Col <- with(df1, Value / ave(Value, ID, FUN = function(v) v[1L]))
NOTE: The comma in 'Value' suggests it is a character column. If so, the comma should first be changed to a decimal point (.) and the column converted to numeric before doing the division. This can be done while reading the data.
Or, without creating an additional column:
library(tidyverse)
# Example data with Value stored as numeric.
df <- data.frame(
  ID = c(1, 1, 1, 2, 2),
  Obs = c(1, 2, 3, 1, 2),
  Value = c(26, 13, 52, 1.5, 30)
)

# Overwrite Value in place with Value / (first Value of its ID group).
# across() replaces the superseded mutate_at().
df %>%
  group_by(ID) %>%
  mutate(across(Value, ~ .x / first(.x)))
#> # A tibble: 5 x 3
#> # Groups: ID [2]
#> ID Obs Value
#> <dbl> <dbl> <dbl>
#> 1 1 1 1
#> 2 1 2 0.5
#> 3 1 3 2
#> 4 2 1 1
#> 5 2 2 20
### OR ###
# Same in-place division, written with an explicit function instead of a
# formula lambda; across() replaces the superseded mutate_at().
df %>%
  group_by(ID) %>%
  mutate(across(Value, function(x) x / first(x)))
#> # A tibble: 5 x 3
#> # Groups: ID [2]
#> ID Obs Value
#> <dbl> <dbl> <dbl>
#> 1 1 1 1
#> 2 1 2 0.5
#> 3 1 3 2
#> 4 2 1 1
#> 5 2 2 20
Created on 2020-01-04 by the reprex package (v0.3.0)