Each day a company creates a value for category_1 and category_2.
A new company may enter the survey midway as company E appears on Dec 25.
Here are three days of data. So, two intervals: Dec 24-25 and Dec 25-26.
Question
For each category how many increase/decreases/no change were there over the 3 days?
For example, in cat1 A goes from a 2 to 1, B goes from a 3 to a 4, etc.
By hand I get:
cat1 - Up: 2, Down: 5, No change: 2
cat2 - Up: 6, Down: 2, No change: 1
How do I calculate the number of up/downs/no changes in an R Script?
library("tidyverse")
d1 <- as.Date("2022-12-24")
d2 <- as.Date("2022-12-25")
d3 <- as.Date("2022-12-26")
df <- tibble(
company = c(LETTERS[1:4], LETTERS[1:5], LETTERS[1:5]),
cat1 = c(2, 3, 4, 5, 1, 4, 5, 3, 2, 1, 4, 4, 2, 1),
cat2 = c(6, 7, 8, 9, 5, 5, 9, 10, 11, 6, 5, 10, 12, 13),
date = c(rep(d1, 4), rep(d2, 5), rep(d2, 5))
)
df
One approach using dplyr, assuming arranged data. Note: I changed the typo in date 3 to d3.
library(dplyr)
df %>%
group_by(company) %>%
mutate(cat1_change = cat1 - lag(cat1), cat2_change = cat2 - lag(cat2)) %>%
ungroup() %>%
summarize(type = c("up", "down", "no-change"),
across(ends_with("change"), ~
c(sum(.x > 0, na.rm=T), sum(.x < 0, na.rm=T), sum(.x == 0, na.rm=T))))
# A tibble: 3 × 3
type cat1_change cat2_change
<chr> <int> <int>
1 up 2 6
2 down 5 2
3 no-change 2 1
Data
df <- structure(list(company = c("A", "B", "C", "D", "A", "B", "C",
"D", "E", "A", "B", "C", "D", "E"), cat1 = c(2, 3, 4, 5, 1, 4,
5, 3, 2, 1, 4, 4, 2, 1), cat2 = c(6, 7, 8, 9, 5, 5, 9, 10, 11,
6, 5, 10, 12, 13), date = structure(c(19350, 19350, 19350, 19350,
19351, 19351, 19351, 19351, 19351, 19352, 19352, 19352, 19352,
19352), class = "Date")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -14L))
An option with data.table - grouped by company, loop over the 'cat' column, get the diff of adjacent elements, convert to sign, and rename with factor labels, melt to long format and reshape back to 'wide' format with dcast
library(data.table)
dcast(melt(setDT(df)[, lapply(.SD, \(x) factor(sign(diff(x)),
levels = c(-1, 0, 1), labels = c("down", "no-change", "up"))),
company, .SDcols = patterns("^cat")], id.var = "company",
value.name = "type"), type ~ paste0(variable, "_change"), length)
-output
type cat1_change cat2_change
1: down 5 2
2: no-change 2 1
3: up 2 6
Related
I have a list of dataframes. It looks something like this:
df1 <- data.frame(Var1 = c(1, 7, 9, 4, 2),
Var2 = c(7, 2, 4, 4, 3),
Var3 = c(3, 6, 2, 0, 8))
df2 <- data.frame(Var1 = c(5, 6, 2, 2, 1),
Var2 = c(8, 6, 6, 7, 4),
Var3 = c(9, 0, 1, 3, 4))
df3.wxyz <- data.frame(Var1 = c("w", "x", "y", "z", 3, 7, 3, 6, 6),
Var2 = c(NA, NA, NA, NA, 7, 5, 8, 0, 2),
Var3 = c(NA, NA, NA, NA, 3, 3, 4, 1, 9))
df4 <- data.frame(Var1 = c(2, 7, 2, 4, 8),
Var2 = c(8, 3, 1, 7, 3),
Var3 = c(9, 1, 1, 6, 5))
df5.wxyz <- data.frame(Var1 = c("w", "x", "y", "z", 2, 7, 3, 1, 6),
Var2 = c(NA, NA, NA, NA, 7, 4, 8, 1, 9),
Var3 = c(NA, NA, NA, NA, 8, 0, 4, 1, 2))
df.list <- list(df1, df2, df3.wxyz, df4, df5.wxyz)
names(df.list) <- c("df1", "df2", "df3.wxyz", "df4", "df5.wxyz")
I would like to remove the first 4 rows of df3.wxyz and df5.wxyz from the list of dataframes as those contain information that I do not need. What I've tried is the following code, but instead of only removing the first 4 rows in df3.wxyz and df5.wxyz, it is removing the first 4 rows from every dataframe in my list. I'm not sure what the issue is.
df.list <- lapply(df.list, function(i){
ifelse(grepl("wxyz", names(df.list)), i <- i[-c(1:4), ], df.list)
i
})
This is what I would like to achieve:
df1 <- data.frame(Var1 = c(1, 7, 9, 4, 2),
Var2 = c(7, 2, 4, 4, 3),
Var3 = c(3, 6, 2, 0, 8))
df2 <- data.frame(Var1 = c(5, 6, 2, 2, 1),
Var2 = c(8, 6, 6, 7, 4),
Var3 = c(9, 0, 1, 3, 4))
df3.wxyz <- data.frame(Var1 = c(3, 7, 3, 6, 6),
Var2 = c(7, 5, 8, 0, 2),
Var3 = c(3, 3, 4, 1, 9))
df4 <- data.frame(Var1 = c(2, 7, 2, 4, 8),
Var2 = c(8, 3, 1, 7, 3),
Var3 = c(9, 1, 1, 6, 5))
df5.wxyz <- data.frame(Var1 = c(2, 7, 3, 1, 6),
Var2 = c(7, 4, 8, 1, 9),
Var3 = c(8, 0, 4, 1, 2))
df.list <- list(df1, df2, df3.wxyz, df4, df5.wxyz)
names(df.list) <- c("df1", "df2", "df3.wxyz", "df4", "df5.wxyz")
You can try,
df.list[grepl('wxyz', names(df.list))] <- lapply(df.list[grepl('wxyz', names(df.list))], na.omit)
You can try na.omit like below
> Map(na.omit,df.list)
$df1
Var1 Var2 Var3
1 1 7 3
2 7 2 6
3 9 4 2
4 4 4 0
5 2 3 8
$df2
Var1 Var2 Var3
1 5 8 9
2 6 6 0
3 2 6 1
4 2 7 3
5 1 4 4
$df3.wxyz
Var1 Var2 Var3
5 3 7 3
6 7 5 3
7 3 8 4
8 6 0 1
9 6 2 9
$df4
Var1 Var2 Var3
1 2 8 9
2 7 3 1
3 2 1 1
4 4 7 6
5 8 3 5
$df5.wxyz
Var1 Var2 Var3
5 2 7 8
6 7 4 0
7 3 8 4
8 1 1 1
9 6 9 2
I am trying to sort my data in descending or ascending order regardless of the data in the rows. I made a dummy example below:
A <- c(9,9,5,4,6,3,2,NA)
B <- c(9,5,3,4,1,4,NA,NA)
C <- c(1,4,5,6,7,4,2,4)
base <- data.frame(A,B,C)
df <- base
df$A <- sort(df$A,na.last = T)
df$B <- sort(df$B,na.last = T)
df$C <- sort(df$C)
We get this
structure(list(A = c(2, 3, 3, 4, 4, 4, 5, 5, 6, 9, 9, NA), B = c(1,
2, 3, 4, 4, 4, 5, 5, 9, 10, NA, NA), C = c(1, 2, 3, 4, 4, 4,
5, 5, 6, 7, 8, 8)), row.names = c(NA, -12L), class = "data.frame")
I want to get something similar to df but my data have hundreds of columns, is there an easier way to do it?
I tried arrange_all() but the result is not what i want.
library(tidyverse)
test <- base%>%
arrange_all()
Obtaining this:
structure(list(A = c(2, 3, 3, 4, 4, 4, 5, 5, 6, 9, 9, NA), B = c(NA,
2, 4, 4, 5, 10, 3, 4, 1, 5, 9, NA), C = c(2, 3, 4, 6, 8, 5, 5,
8, 7, 4, 1, 4)), class = "data.frame", row.names = c(NA, -12L
))
You can sort each column individually :
library(dplyr)
base %>% mutate(across(.fns = sort, na.last = TRUE))
# A B C
#1 2 1 1
#2 3 3 2
#3 4 4 4
#4 5 4 4
#5 6 5 4
#6 9 9 5
#7 9 NA 6
#8 NA NA 7
Or in base R :
base[] <- lapply(base, sort, na.last = TRUE)
I have a cvs file that has the following structure (minimum example):
ID Variable Vector
1 a [0,0,0]
2 a [1,2,3]
1 a [1,1,2]
2 a [1,2,3]
1 b [0,0,0]
2 b [1,1,1]
1 b [0,0,1]
2 b [3,5,7]
I would like to calculate the mean vector for each combination of parameters (in this case, ID and Variable). That is, I want to obtain a dataframe like the following one:
ID Variable Vector
1 a [0.5,0.5,1]
2 a [1,2,3]
1 b [0,0,0.5]
2 b [2,3,4]
I have generated this csv file with Python, that's why I have that structure with brackets. But I do not know how to start to do this using R. It doesn't seem to be a common data structure.
Update:
Vector variable structure (obtained from dput(head(data, 8))
Vector = c("[3, 16, 14, 5, 6, 13, 17, 7, 13, 6]",
"[7, 12, 6, 10, 6, 5, 16, 9, 19, 10]", "[4, 13, 4, 11, 6, 15, 17, 10, 12, 8]",
"[18, 11, 16, 8, 10, 10, 7, 4, 9, 7]", "[9, 9, 10, 17, 8, 13, 3, 13, 8, 10]",
"[17, 12, 7, 13, 6, 13, 8, 9, 5, 10]", "[9, 6, 14, 10, 8, 4, 8, 14, 15, 12]",
"[7, 13, 8, 10, 16, 8, 13, 13, 8, 4]")), row.names = c(NA, 8L
), class = "data.frame")
Assuming the 'Vector' column is a list, after grouping by 'ID', 'Variable', we reduce the 'Vector' by adding (+) the corresponding elements together and then divide by the total number of elements (n()) in that group
library(dplyr)
library(purrr)
out <- df1 %>%
group_by(ID, Variable) %>%
summarise(Vector = list(reduce(Vector, `+`)/n()), .groups = 'drop')
-output
out
# A tibble: 4 x 3
# ID Variable Vector
# <dbl> <chr> <list>
#1 1 a <dbl [3]>
#2 1 b <dbl [3]>
#3 2 a <dbl [3]>
#4 2 b <dbl [3]>
out$Vector
#[[1]]
#[1] 0.5 0.5 1.0
#[[2]]
#[1] 0.0 0.0 0.5
#[[3]]
#[1] 1 2 3
#[[4]]
#[1] 2 3 4
If the column 'Vector' is a character string, an option is to extract the numeric part into a list
library(stringr)
out <- df1 %>%
group_by(ID, Variable) %>%
summarise(Vector = list((str_extract_all(Vector, "\\d+") %>%
map(as.numeric) %>% reduce(`+`))/n()), .groups = 'drop')
data
df1 <- structure(list(ID = c(1, 2, 1, 2, 1, 2, 1, 2), Variable = c("a",
"a", "a", "a", "b", "b", "b", "b"), Vector = structure(list(c(0,
0, 0), c(1, 2, 3), c(1, 1, 2), c(1, 2, 3), c(0, 0, 0), c(1, 1,
1), c(0, 0, 1), c(3, 5, 7)), class = "AsIs")), class = "data.frame",
row.names = c(NA,
-8L))
structure(list(a = c(NA, 3, 4, NA, 3, "Council" , "Council", 1), b = c("Council A", 3, 4,
"Council B", 6, 7, 2, 6), c = c(6, 3, 6, 5, 3, 6, 5, 3), d = c(6, 2, 4,
5, 3, 7, 2, 6), e = c(1, 2, 4, 5, 6, 7, 6, 3), f = c(2, 3, 4,
2, 2, 7, 5, 2)), .Names = c("a", "b", "c", "d", "e", "f"), row.names = c(NA,
8L), class = "data.frame")
I am trying to convert objects in a using dplyr mutuate and case_when based on text in b . I want to convert values in a to Council if b contains Council in the string.
The code i've used is DF %>% select(a, b) %>% mutate(a =case_when(grepl("Council", b) ~"Council"))
However all values become NA in a if they do not contain the string Council. I've reviewed other posts and attempted various methods including ifelse. I want to maintain the same dataframe just make any NA values in a be converted to Council but only in the cases where it is NA values.
From ?case_when
If no cases match, NA is returned.
So for the cases when there is no "Council" word in b it returns NA.
You need to define the TRUE argument in case_when and assign it to a to keep the values unchanged when the condition is not met.
library(dplyr)
df %>%
mutate(a = case_when(grepl("Council", b) ~"Council",
TRUE ~ a))
# a b c d e f
#1 Council Council A 6 6 1 2
#2 3 3 3 2 2 3
#3 4 4 6 4 4 4
#4 Council Council B 5 5 5 2
#5 3 6 3 3 6 2
#6 Council 7 6 7 7 7
#7 Council 2 5 2 6 5
#8 1 6 3 6 3 2
In this case you could also achieve your result using base R
df$a[grepl("Council", df$b)] <- "Council"
You can also use str_detect from the package stringr to achieve your objective.
library(dplyr)
library(stringr)
df <- structure(list(a = c(NA, 3, 4, NA, 3, "Council" , "Council", 1), b = c("Council A", 3, 4,
"Council B", 6, 7, 2, 6), c = c(6, 3, 6, 5, 3, 6, 5, 3), d = c(6, 2, 4,
5, 3, 7, 2, 6), e = c(1, 2, 4, 5, 6, 7, 6, 3), f = c(2, 3, 4,
2, 2, 7, 5, 2)), .Names = c("a", "b", "c", "d", "e", "f"), row.names = c(NA,
8L), class = "data.frame")
df %>%
mutate(a=ifelse(str_detect(b,fixed("council",ignore_case = T)) & is.na(a),"Council",a))
I am getting getting an error when I try to merge two data sets using left_join with three "key" variables. Here is a reproducible example:
df1 <- tribble(
~var1, ~var2, ~var3, ~var4,
1, 1, 1, 4,
2, 2, 2, 5,
3, 3, 3, 6
)
df2 <- tribble(
~var1, ~var2, ~var3, ~var5,
1, 1, 1, 7,
2, 2, 2, 8,
3, 3, 3, 9
)
df3<- df1 %>%
left_join(df2, by = "var1", "var2", "var3")
This gives the following error:
Error: `suffix` must be a character vector of length 2, not string of length
1
I know that join must be having an issue naming the key variables that are used for join, but I don't understand why.
Here is what I would like to have as a result:
# A tibble: 3 x 5
var1 var2 var3 var4 var5
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 4 7
2 2 2 2 5 8
3 3 3 3 6 9
Solution (thanks #user127649)
df3<- df1 %>%
left_join(df2, by = c("var1", "var2", "var3"))
I'd do this:
df1 <- tribble(
~var1, ~var2, ~var3, ~var4,
1, 1, 1, 4,
2, 2, 2, 5,
3, 3, 3, 6
)
df2 <- tribble(
~var1, ~var2, ~var3, ~var5,
1, 1, 1, 7,
2, 2, 2, 8,
3, 3, 3, 9
)
df3 <- merge(df1, df2, all.x = TRUE)