I have a list named df which contains three iterations with two years of projection.
What I want is: weighting the variable "district" just for year 2 in each iteration and finally I want to have mean of each weighted district for all three iterations. Note that each year has a variable named "weight" that weighting should be based on this variable.
iteration1 <- list(year1 = data.frame(age = c(10, 11, 12, 13),
district = c(1, 2, 3, 4),
gender = c(1, 2, 2, 1),
weight = c(12.2, 11.3, 11.2, 10.1)),
year2 = data.frame(age = c(10, 11, 12, 13, 10, 10),
district = c(1, 2, 3, 4, 2, 1),
gender = c(1, 2, 2, 1, 1, 1),
weight = c(12.2, 11.3, 11.2, 10.1, 12.2, 13.1)))
iteration2 <- list(year1 = data.frame(age = c(10, 11, 12, 13),
district = c(1, 2, 3, 4),
gender = c(2, 2, 1, 1),
weight = c(12.2, 11.3, 11.2, 10.1)),
year2 = data.frame(age = c(10, 11, 12, 13, 13, 13, 12),
district = c(1, 2, 3, 4, 1, 3, 3),
gender = c(2, 2, 1, 1, 2, 2, 2),
weight = c(12.2, 11.3, 11.2, 10.1, 10.9, 11.9, 15.1)))
iteration3 <- list(year1 = data.frame(age = c(10, 11, 12, 13),
district = c(1, 2, 3, 4),
gender = c(2, 2, 1, 1),
weight = c(12.2, 11.3, 11.2, 10.1)),
year2 = data.frame(age = c(10, 11, 12, 13, 10, 10, 11, 12),
district = c(1, 2, 3, 4, 4, 3, 2, 2),
gender = c(2, 2, 1, 1, 2, 2, 1, 2),
weight = c(12.2, 11.3, 11.2, 10.1, 13.5, 12.8, 13.9, 14.9)))
df <- list(iteration1 = iteration1, iteration2 = iteration2, iteration3 = iteration3)
Expected output:
district mean of each district for all three iterations
1 20.2
2 24.96
3 24.46
4 14.6
for calculating my expected output I have followed two steps. in first step، I have weighted year 2 in each iteration by wtd.table(df$iteration1$year2$district,weights=df$iteration1$year2$weight) . I repeated this code for three times (because I have three iterations). here is my output:
1 2 3 4
25.3 23.5 11.2 10.1
1 2 3 4
23.1 11.3 38.2 10.1
1 2 3 4
12.2 40.1 24.0 23.6
in second step, I calculate mean of each district for three iterations manually: mean(25.3,23.1,12.2)
data.table approach
library(data.table)
library(questionr)
ans <- rbindlist(
lapply(df, function(x)
as.data.table(
questionr::wtd.table(x[["year2"]]$district,
weights = x[["year2"]]$weight))),
use.names = TRUE, fill = TRUE)
# Summarise
ans[, .(weight = mean(N, na.rm = TRUE)), by = .(district = V1)]
# district weight
# 1: 1 20.20000
# 2: 2 24.96667
# 3: 3 24.46667
# 4: 4 14.60000
Version 2
With updated columns based on TS's comment below
ans <- rbindlist(
lapply(df, function(x)
as.data.table(
questionr::wtd.table(x = x[["year2"]]$district,
y = x[["year2"]]$gender,
weights = x[["year2"]]$weight) ) ),
use.names = TRUE, fill = TRUE )
# Summarise
ans[, .(n = .N,
mean = mean(N, na.rm = TRUE),
sd = sd(N, na.rm = TRUE)),
by = .(district = V1, gender = V2)]
# district gender n mean sd
# 1: 1 1 3 8.433333 14.606962
# 2: 2 1 3 8.700000 7.582216
# 3: 3 1 3 7.466667 6.466323
# 4: 4 1 3 10.100000 0.000000
# 5: 1 2 3 11.766667 11.556095
# 6: 2 2 3 16.266667 8.602519
# 7: 3 2 3 17.000000 8.697126
# 8: 4 2 3 4.500000 7.794229
Combine the list of dataframes into one and calculate average weight using questionr::wtd.table for each district and iteration in year2. Finally, get aggregated mean for each district.
Using tidyverse you can do -
library(dplyr)
library(purrr)
map_df(df, ~bind_rows(.x, .id = 'year'), .id = 'iter') %>%
filter(year == 'year2') %>%
group_by(district, iter) %>%
summarise(result = questionr::wtd.table(district,weights=weight)) %>%
summarise(result = mean(result))
# district result
# <dbl> <dbl>
#1 1 20.2
#2 2 25.0
#3 3 24.5
#4 4 14.6
Related
Each day a company creates a value for category_1 and category_2.
A new company may enter the survey midway as company E appears on Dec 25.
Here are three days of data. So, two intervals: Dec 24-25 and Dec 25-26.
Question
For each category how many increase/decreases/no change were there over the 3 days?
For example, in cat1 A goes from a 2 to 1, B goes from a 3 to a 4, etc.
By hand I get:
cat1 - Up: 2, Down: 5, No change: 2
cat2 - Up: 6, Down: 2, No change: 1
How do I calculate the number of up/downs/no changes in an R Script?
library("tidyverse")
d1 <- as.Date("2022-12-24")
d2 <- as.Date("2022-12-25")
d3 <- as.Date("2022-12-26")
df <- tibble(
company = c(LETTERS[1:4], LETTERS[1:5], LETTERS[1:5]),
cat1 = c(2, 3, 4, 5, 1, 4, 5, 3, 2, 1, 4, 4, 2, 1),
cat2 = c(6, 7, 8, 9, 5, 5, 9, 10, 11, 6, 5, 10, 12, 13),
date = c(rep(d1, 4), rep(d2, 5), rep(d2, 5))
)
df
One approach using dplyr, assuming arranged data. Note: I changed the typo in date 3 to d3.
library(dplyr)
df %>%
group_by(company) %>%
mutate(cat1_change = cat1 - lag(cat1), cat2_change = cat2 - lag(cat2)) %>%
ungroup() %>%
summarize(type = c("up", "down", "no-change"),
across(ends_with("change"), ~
c(sum(.x > 0, na.rm=T), sum(.x < 0, na.rm=T), sum(.x == 0, na.rm=T))))
# A tibble: 3 × 3
type cat1_change cat2_change
<chr> <int> <int>
1 up 2 6
2 down 5 2
3 no-change 2 1
Data
df <- structure(list(company = c("A", "B", "C", "D", "A", "B", "C",
"D", "E", "A", "B", "C", "D", "E"), cat1 = c(2, 3, 4, 5, 1, 4,
5, 3, 2, 1, 4, 4, 2, 1), cat2 = c(6, 7, 8, 9, 5, 5, 9, 10, 11,
6, 5, 10, 12, 13), date = structure(c(19350, 19350, 19350, 19350,
19351, 19351, 19351, 19351, 19351, 19352, 19352, 19352, 19352,
19352), class = "Date")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -14L))
An option with data.table - grouped by company, loop over the 'cat' column, get the diff of adjacent elements, convert to sign, and rename with factor labels, melt to long format and reshape back to 'wide' format with dcast
library(data.table)
dcast(melt(setDT(df)[, lapply(.SD, \(x) factor(sign(diff(x)),
levels = c(-1, 0, 1), labels = c("down", "no-change", "up"))),
company, .SDcols = patterns("^cat")], id.var = "company",
value.name = "type"), type ~ paste0(variable, "_change"), length)
-output
type cat1_change cat2_change
1: down 5 2
2: no-change 2 1
3: up 2 6
I have a list of dataframes. It looks something like this:
df1 <- data.frame(Var1 = c(1, 7, 9, 4, 2),
Var2 = c(7, 2, 4, 4, 3),
Var3 = c(3, 6, 2, 0, 8))
df2 <- data.frame(Var1 = c(5, 6, 2, 2, 1),
Var2 = c(8, 6, 6, 7, 4),
Var3 = c(9, 0, 1, 3, 4))
df3.wxyz <- data.frame(Var1 = c("w", "x", "y", "z", 3, 7, 3, 6, 6),
Var2 = c(NA, NA, NA, NA, 7, 5, 8, 0, 2),
Var3 = c(NA, NA, NA, NA, 3, 3, 4, 1, 9))
df4 <- data.frame(Var1 = c(2, 7, 2, 4, 8),
Var2 = c(8, 3, 1, 7, 3),
Var3 = c(9, 1, 1, 6, 5))
df5.wxyz <- data.frame(Var1 = c("w", "x", "y", "z", 2, 7, 3, 1, 6),
Var2 = c(NA, NA, NA, NA, 7, 4, 8, 1, 9),
Var3 = c(NA, NA, NA, NA, 8, 0, 4, 1, 2))
df.list <- list(df1, df2, df3.wxyz, df4, df5.wxyz)
names(df.list) <- c("df1", "df2", "df3.wxyz", "df4", "df5.wxyz")
I would like to remove the first 4 rows of df3.wxyz and df5.wxyz from the list of dataframes as those contain information that I do not need. What I've tried is the following code, but instead of only removing the first 4 rows in df3.wxyz and df5.wxyz, it is removing the first 4 rows from every dataframe in my list. I'm not sure what the issue is.
df.list <- lapply(df.list, function(i){
ifelse(grepl("wxyz", names(df.list)), i <- i[-c(1:4), ], df.list)
i
})
This is what I would like to achieve:
df1 <- data.frame(Var1 = c(1, 7, 9, 4, 2),
Var2 = c(7, 2, 4, 4, 3),
Var3 = c(3, 6, 2, 0, 8))
df2 <- data.frame(Var1 = c(5, 6, 2, 2, 1),
Var2 = c(8, 6, 6, 7, 4),
Var3 = c(9, 0, 1, 3, 4))
df3.wxyz <- data.frame(Var1 = c(3, 7, 3, 6, 6),
Var2 = c(7, 5, 8, 0, 2),
Var3 = c(3, 3, 4, 1, 9))
df4 <- data.frame(Var1 = c(2, 7, 2, 4, 8),
Var2 = c(8, 3, 1, 7, 3),
Var3 = c(9, 1, 1, 6, 5))
df5.wxyz <- data.frame(Var1 = c(2, 7, 3, 1, 6),
Var2 = c(7, 4, 8, 1, 9),
Var3 = c(8, 0, 4, 1, 2))
df.list <- list(df1, df2, df3.wxyz, df4, df5.wxyz)
names(df.list) <- c("df1", "df2", "df3.wxyz", "df4", "df5.wxyz")
You can try,
df.list[grepl('wxyz', names(df.list))] <- lapply(df.list[grepl('wxyz', names(df.list))], na.omit)
You can try na.omit like below
> Map(na.omit,df.list)
$df1
Var1 Var2 Var3
1 1 7 3
2 7 2 6
3 9 4 2
4 4 4 0
5 2 3 8
$df2
Var1 Var2 Var3
1 5 8 9
2 6 6 0
3 2 6 1
4 2 7 3
5 1 4 4
$df3.wxyz
Var1 Var2 Var3
5 3 7 3
6 7 5 3
7 3 8 4
8 6 0 1
9 6 2 9
$df4
Var1 Var2 Var3
1 2 8 9
2 7 3 1
3 2 1 1
4 4 7 6
5 8 3 5
$df5.wxyz
Var1 Var2 Var3
5 2 7 8
6 7 4 0
7 3 8 4
8 1 1 1
9 6 9 2
I have a cvs file that has the following structure (minimum example):
ID Variable Vector
1 a [0,0,0]
2 a [1,2,3]
1 a [1,1,2]
2 a [1,2,3]
1 b [0,0,0]
2 b [1,1,1]
1 b [0,0,1]
2 b [3,5,7]
I would like to calculate the mean vector for each combination of parameters (in this case, ID and Variable). That is, I want to obtain a dataframe like the following one:
ID Variable Vector
1 a [0.5,0.5,1]
2 a [1,2,3]
1 b [0,0,0.5]
2 b [2,3,4]
I have generated this csv file with Python, that's why I have that structure with brackets. But I do not know how to start to do this using R. It doesn't seem to be a common data structure.
Update:
Vector variable structure (obtained from dput(head(data, 8))
Vector = c("[3, 16, 14, 5, 6, 13, 17, 7, 13, 6]",
"[7, 12, 6, 10, 6, 5, 16, 9, 19, 10]", "[4, 13, 4, 11, 6, 15, 17, 10, 12, 8]",
"[18, 11, 16, 8, 10, 10, 7, 4, 9, 7]", "[9, 9, 10, 17, 8, 13, 3, 13, 8, 10]",
"[17, 12, 7, 13, 6, 13, 8, 9, 5, 10]", "[9, 6, 14, 10, 8, 4, 8, 14, 15, 12]",
"[7, 13, 8, 10, 16, 8, 13, 13, 8, 4]")), row.names = c(NA, 8L
), class = "data.frame")
Assuming the 'Vector' column is a list, after grouping by 'ID', 'Variable', we reduce the 'Vector' by adding (+) the corresponding elements together and then divide by the total number of elements (n()) in that group
library(dplyr)
library(purrr)
out <- df1 %>%
group_by(ID, Variable) %>%
summarise(Vector = list(reduce(Vector, `+`)/n()), .groups = 'drop')
-output
out
# A tibble: 4 x 3
# ID Variable Vector
# <dbl> <chr> <list>
#1 1 a <dbl [3]>
#2 1 b <dbl [3]>
#3 2 a <dbl [3]>
#4 2 b <dbl [3]>
out$Vector
#[[1]]
#[1] 0.5 0.5 1.0
#[[2]]
#[1] 0.0 0.0 0.5
#[[3]]
#[1] 1 2 3
#[[4]]
#[1] 2 3 4
If the column 'Vector' is a character string, an option is to extract the numeric part into a list
library(stringr)
out <- df1 %>%
group_by(ID, Variable) %>%
summarise(Vector = list((str_extract_all(Vector, "\\d+") %>%
map(as.numeric) %>% reduce(`+`))/n()), .groups = 'drop')
data
df1 <- structure(list(ID = c(1, 2, 1, 2, 1, 2, 1, 2), Variable = c("a",
"a", "a", "a", "b", "b", "b", "b"), Vector = structure(list(c(0,
0, 0), c(1, 2, 3), c(1, 1, 2), c(1, 2, 3), c(0, 0, 0), c(1, 1,
1), c(0, 0, 1), c(3, 5, 7)), class = "AsIs")), class = "data.frame",
row.names = c(NA,
-8L))
Here is data set 'before' and 'after' shifting.
# Data set 'before'
df_before <- t(data.table(
x = c(1, 2, 3, 4, 5),
y = c(0, 6, 7, 8, 9),
z = c(0, 0, 11, 12, 13)))
# Shift operation
# ...
# Data set 'after'
df_after <- t(data.table(
x = c(1, 2, 3, 4, 5),
y = c(6, 7, 8, 9, NA),
z = c(11, 12, 13, NA, NA)))
How to make this kind of shifting on +1 cell only for all rows?
Thanks!
Something like this? Just start the rows always shifted by one and reset their length. The latter adds NAs.
t(sapply(1:nrow(DF), function(x) `length<-`(DF[x, x:ncol(DF)], ncol(DF))))
# [,1] [,2] [,3] [,4] [,5]
# [1,] 1 2 3 4 5
# [2,] 6 7 8 9 NA
# [3,] 11 12 13 NA NA
Data
DF <- structure(c(1, 0, 0, 2, 6, 0, 3, 7, 11, 4, 8, 12, 5, 9, 13), .Dim = c(3L,
5L), .Dimnames = list(c("x", "y", "z"), NULL))
Taking a guess at the logic:
t(apply(df_before, 1, function(x) `length<-`(x[x != 0], ncol(df_before))))
[,1] [,2] [,3] [,4] [,5]
x 1 2 3 4 5
y 6 7 8 9 NA
z 11 12 13 NA NA
You can un-transpose the df_before data.frame then use the lead function from dplyr
to shift the columns
library(data.table)
library(dplyr)
df_before <- data.table(
x = c(1, 2, 3, 4, 5),
y = c(0, 6, 7, 8, 9),
z = c(0, 0, 11, 12, 13))
df_after <- t(data.table(
x = c(1, 2, 3, 4, 5),
y = c(6, 7, 8, 9, NA),
z = c(11, 12, 13, NA, NA)))
df_before[] <-lapply(1:ncol(df_before), function(x){
dplyr::lead(df_before[[x]],n= x-1)
})
If you need to transpose the data after this step:
df_after2 <- t(df_before)
all.equal(df_after,df_after2) # TRUE
I have imported a .sav file with Haven but where I am stuck is that I cant seem to work out how to print the label names in place or, with the label codings. Labels: 1 = unemployed, 2 = looking etc.
Employment <- select(well_being_df, EmploymentStatus, Gender) %>% <group_by(EmploymentStatus) %>% summarise_all(funs(mean, n = n(), sd,min(.,is.na = TRUE), max(.,is.na = TRUE)))
# A tibble: 5 x 6
EmploymentStatus mean n sd min max
<dbl+lbl> <dbl> <int> <dbl> <dbl> <dbl>
1 1 1.67 12 0.492 1 2
2 2 1.17 6 0.408 1 2
3 3 1.8 85 0.431 1 3
4 4 1.5 62 0.504 1 2
5 5 1.5 4 0.577 1 2
Ideally:
# A tibble: 5 x 6
EmploymentStatus mean n sd min max
<dbl+lbl> <dbl> <int> <dbl> <dbl> <dbl>
1 1 Unemployed 1.67 12 0.492 1 2
2 2 Looking 1.17 6 0.408 1 2
3 3 Etc 1.8 85 0.431 1 3
4 4 1.5 62 0.504 1 2
5 5 1.5 4 0.577 1 2
dput(head(well_being_df, 10))
structure(list(Age = c(22, 20, 23, 20, 25, 18, 24, 21, 21, 30.7344197070233
), Gender = structure(c(2, 2, 1, 2, 1, 2, 2, 2, 2, 1), labels = c(Male = 1,
Female = 2, Transgender = 3), class = "labelled"), EmploymentStatus = structure(c(3,
1, 4, 3, 3, 3, 3, 4, 3, 4), labels = c(`Unemployed but not looking` = 1,
`Unemployed and looking` = 2, `Part-time` = 3, `Full-time` = 4,
Retired = 5), class = "labelled"), Cognition1 = structure(c(6,
3, 6, 5, 9, 6, 4, 4, 7, 5), labels = c(`Provides nothing that you want` = 0,
`Provides half of what you want` = 5, `Provides all that you want` = 10
), class = "labelled"), Cognition2 = structure(c(7, 3, 8,
5, 8, 5, 5, 7, 7, 3), labels = c(`Far below average` = 0,
`About Average` = 5, `Far above average` = 10), class = "labelled"),
Cognition3 = structure(c(6, 5, 4, 5, 6, 5, 5, 5, 5, 5), labels = c(`Far less than you deserve` = 0,
`About what you deserve` = 5, `Far more than you deserve` = 10
), class = "labelled"), Cognition4 = structure(c(7, 3, 6,
2, 8, 3, 3, 5, 6, 2), labels = c(`Far less than you need` = 0,
`About what you need` = 5, `Far more than you need` = 10), class = "labelled"),
Cognition5 = structure(c(10, 9, 6, 3, 7, 2, 2, 0, 4, 0), labels = c(`Far less than expected` = 0,
`About as expected` = 5, `Far more than expected` = 10), class = "labelled"),
Cognition6 = structure(c(8, 6, 0, 3, 3, 8, 9, 10, 5, 10), labels = c(`Far more than it will in the future` = 0,
`About what you expect in the future` = 5, `Far less than what the future will offer` = 10
), class = "labelled"), Cognition7 = structure(c(9, 7, 10,
5, 6, 2, 3, 0, 8, 3), labels = c(`Far below previous best` = 0,
`Equals previous best` = 5, `Far above previous best` = 10
), class = "labelled")), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
Employment <- select(well_being_df, EmploymentStatus, Gender) %>%
mutate(EmploymentStatus = labelled::to_factor(EmploymentStatus)) %>% # use labelled package
group_by(EmploymentStatus) %>%
summarise_all(funs(mean, n = n(), sd,min(.,is.na = TRUE), max(.,is.na = TRUE)))