Using the map() function to apply a function to each element - R

I need to apply the above to each element with the help of the map() function.
How can I do so?

As dt is of class data.table, you can make a vector of the columns of interest (i.e. your items; below I use grepl on the names) and then apply your weighting function to each of those columns using .SD and .SDcols, with by:
qs <- names(dt)[grepl("^q", names(dt))]
dt[, (paste0(qs, "wt")) := lapply(.SD, \(q) 1/(sum(!is.na(q))/.N)),
   by = .(sex, education_code, age), .SDcols = qs]
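Since the question asks about map() specifically: purrr::map() returns a list just like lapply(), so it can be swapped in directly. A minimal sketch, assuming the same dt and qs as above:
library(data.table)
library(purrr)

# map() is a drop-in replacement for lapply() here, since both
# return the list of weighted columns that `:=` expects
dt[, (paste0(qs, "wt")) := map(.SD, \(q) 1/(sum(!is.na(q))/.N)),
   by = .(sex, education_code, age), .SDcols = qs]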

As mentioned in the comments, you are missing a dt <- in your dt[, .(ID, education_code, age, sex, item = q1_1)], which makes the column item unavailable in the following line dt[, no_respond := is.na(item)].
Your weighting scheme is not absolutely clear to me; however, assuming you want to do what your code does here, I would go with a dplyr solution to iterate over the columns.
# your data without the no_respond column and with the missing value in q2_3 corrected
dt <- data.table::data.table(
  ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  education_code = c(20, 50, 20, 60, 20, 10, 5, 12, 12, 12),
  age = c(87, 67, 56, 52, 34, 56, 67, 78, 23, 34),
  sex = c("F", "M", "M", "M", "F", "M", "M", "M", "M", "M"),
  q1_1 = c(NA, 1, 5, 3, 1, NA, 3, 4, 5, 1),
  q1_2 = c(NA, 1, 5, 3, 1, 2, NA, 4, 5, 1),
  q1_3 = c(NA, 1, 5, 3, 1, 2, 3, 4, 5, 1),
  q1_text = c(NA, 1, 5, 3, 1, 2, 3, 4, 5, 1),
  q2_1 = c(NA, 1, 5, 3, 1, 2, 3, 4, 5, 1),
  q2_2 = c(NA, 1, 5, 3, 1, 2, 3, 4, 5, 1),
  q2_3 = c(NA, 1, 5, 3, 1, NA, NA, 4, 5, 1),
  q2_text = c(NA, 1, 5, 3, 1, NA, 3, 4, 5, 1)
)
library(dplyr)

dt %>%
  group_by(sex, education_code, age) %>% # group the data by sex, education_code, age
  add_count() %>%                        # add a column n with the number of rows in each group
  mutate(across(starts_with("q"),        # for each column starting with "q"
                ~ 1/(sum(!is.na(.))/n),  # create a new column following your weight calculation
                .names = '{.col}_wgt')) %>% # name the new column with the suffix "_wgt"
  ungroup()
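A note on the design: add_count() stores each group's size in a column named n, which is what the weight formula divides by. If you would rather not add that column, dplyr's n() can be used directly inside the formula instead, i.e. ~ 1/(sum(!is.na(.))/n()).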

Related

R: Creating new column to represent hi/mid/low bins by mean and standard deviation

I've got a batch of survey data that I'd like to subset on a few specific columns which hold 0-10 scale data (e.g. "Rank your attitude towards x from 0 to 10") so that I can plot using ggplot() + facet_grid(). Faceting will use three hi/med/low bins calculated as one standard deviation above and below the mean. I have working code, which splits the overall dataframe into 3 parts like so:
# Generate sample data:
structure(list(Q4 = c(2, 3, 3, 5, 4, 3), Q5 = c(1, 3, 3, 3, 2, 2),
               Q6 = c(4, 3, 3, 3, 4, 4), Q7 = c(4, 2, 3, 5, 5, 5),
               Q53_1 = c(5, 8, 4, 5, 4, 5)),
          row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))
# Acquire Q53_1 data as factors
political_scale <- factor(climate_experience_data$Q53_1, levels = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
# Generate thresholds based on mean and standard deviation
low_threshold <- round(mean(as.numeric(political_scale), na.rm = TRUE) - sd(as.numeric(political_scale), na.rm = TRUE), digits = 0)
high_threshold <- round(mean(as.numeric(political_scale), na.rm = TRUE) + sd(as.numeric(political_scale), na.rm = TRUE), digits = 0)
# Generate low/med/high bins based on Mean and SD
political_lr_low <- filter(climate_experience_data, Q53_1 <= low_threshold)
political_lr_mid <- filter(climate_experience_data, Q53_1 < high_threshold & Q53_1 > low_threshold)
political_lr_high <- filter(climate_experience_data, Q53_1 >= high_threshold)
What I've realised is that this approach really doesn't lend itself to faceting. What I suspect is that I need to use a combination of mutate(), across(), where() and group_by() to add data to a new column Q53_scale with "hi"/"med"/"low" based on where Q53_1 values fall in relation to those low/high thresholds (i.e. one SD above and one SD below the mean). My first few dozen attempts have fallen short - has anyone managed to use sd() to bin data for faceting in this way?
library(tidyverse)

climate_experience_data <- structure(list(Q4 = c(2, 3, 3, 5, 4, 3),
                                          Q5 = c(1, 3, 3, 3, 2, 2),
                                          Q6 = c(4, 3, 3, 3, 4, 4),
                                          Q7 = c(4, 2, 3, 5, 5, 5),
                                          Q53_1 = c(5, 8, 4, 5, 4, 5)),
                                     row.names = c(NA, -6L),
                                     class = c("tbl_df", "tbl", "data.frame"))
climate_experience_data %>%
  mutate(
    bin = case_when(
      Q53_1 > mean(Q53_1) + sd(Q53_1) ~ "high",
      Q53_1 < mean(Q53_1) - sd(Q53_1) ~ "low",
      TRUE ~ "medium"
    ) %>% factor(levels = c("low", "medium", "high"))
  ) %>%
  ggplot(aes(Q4, Q5)) +
  geom_point() +
  facet_grid(~bin)
Created on 2022-03-10 by the reprex package (v2.0.0)
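If you later want the same binning applied to several of the 0-10 columns at once (the mutate() plus across() combination mentioned in the question), here is a sketch; bin_by_sd is a hypothetical helper name, not from any package:
# hypothetical helper: bin a numeric vector into low/medium/high by mean +/- 1 SD
bin_by_sd <- function(x) {
  factor(case_when(
    x > mean(x, na.rm = TRUE) + sd(x, na.rm = TRUE) ~ "high",
    x < mean(x, na.rm = TRUE) - sd(x, na.rm = TRUE) ~ "low",
    TRUE ~ "medium"
  ), levels = c("low", "medium", "high"))
}

climate_experience_data %>%
  mutate(across(c(Q4, Q53_1), bin_by_sd, .names = "{.col}_bin"))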

How to define score of popularity for list of tags in R?

There is a dataset where each object has a comma-separated list of category tags. I would like to have an aggregated category score per object based on the categories' popularities. I can compute the sum, min, and max of the popularities, but it's not clear to me how an aggregated score should be calculated.
library(tidyverse)
library(tibble)
library(stringr)

# 1. Data
df <- tribble(
  ~object, ~category,
  1, "Software, Model, Cloud",
  2, "Model",
  3, "Cloud, Software",
  4, "Train, Test, Model",
  5, "Test, Model"
)
# 2. List of categories
list_category <- trimws(unlist(str_split(df$category, ",")))
# 3. Categories popularity
data.frame(category = list_category) %>%
  group_by(category) %>%
  summarise(n_count = n()) %>%
  arrange(-n_count) %>%
  ungroup()
# 4. Desired outcome, with the undefined 'score_category' feature that I'd like to know how to score
tribble(
  ~object, ~sum_category, ~min_category, ~max_category, ~score_category,
  1, sum(c(2, 4, 2)), min(c(2, 4, 2)), max(c(2, 4, 2)), NA,
  2, sum(c(4)), min(c(4)), max(c(4)), NA,
  3, sum(c(2, 2)), min(c(2, 2)), max(c(2, 2)), NA,
  4, sum(c(1, 2, 4)), min(c(1, 2, 4)), max(c(1, 2, 4)), NA,
  5, sum(c(2, 4)), min(c(2, 4)), max(c(2, 4)), NA
)
Any ideas and code are welcome!
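Whatever score you settle on, the per-object aggregates in step 4 can at least be computed rather than hard-coded. A minimal sketch using separate_rows() and add_count(); the mean is shown purely as one candidate definition of score_category, not a recommendation:
df %>%
  separate_rows(category, sep = ",\\s*") %>%    # one row per (object, category) pair
  add_count(category, name = "popularity") %>%  # popularity = occurrences of each category overall
  group_by(object) %>%
  summarise(sum_category = sum(popularity),
            min_category = min(popularity),
            max_category = max(popularity),
            score_category = mean(popularity))  # one candidate score: mean popularity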

Joining two dataframes on matching values of two common columns in R

I have two dataframes, A and B, that both have multiple columns. They share the common columns "week" and "store". I would like to join these two dataframes on the matching values of the common columns.
For example, this is a small subset of the data that I have:
A = data.frame(
  retailer = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2),
  store = c(5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6),
  week = c(2021100301, 2021092601, 2021091901, 2021091201, 2021082901, 2021082201, 2021081501, 2021080801,
           2021080101, 2021072501, 2021071801, 2021071101, 2021070401, 2021062701, 2021062001, 2021061301),
  dollars = c(121817.9, 367566.7, 507674.5, 421257.8, 453330.3, 607551.4, 462674.8, 464329.1,
              339342.3, 549271.5, 496720.1, 554858.7, 382675.5, 373210.9, 422534.2, 381668.6)
)
and
B = data.frame(
  week = c("2020080901", "2017111101", "2017061801", "2020090701", "2020090701", "2020090701",
           "2020091201", "2020082301", "2019122201", "2017102901"),
  store = c(14071, 11468, 2428, 17777, 14821, 10935, 5127, 14772, 14772, 14772),
  fill = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
)
I would like to join these two tables on the matching week AND store values in order to incorporate the "fill" column from B into A. Where the values don't match, I would like a "0" in the fill column instead of a 1. Is there a way I can do this? I am also not sure which join to use, or whether "merge" would be better for this. Essentially, I am NOT trying to get rid of any rows that do not have matching values for the two common columns. Thanks for any help!
We may do a left_join, then replace the NA values of fill with 0:
library(dplyr)
library(tidyr)
A %>%
  mutate(week = as.character(week)) %>% # match B's character weeks
  left_join(B) %>%                      # joins by the common columns "week" and "store"
  mutate(fill = replace_na(fill, 0))
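Since you asked about merge(): a base R equivalent, assuming the same character conversion of week so the key types match:
A$week <- as.character(A$week)                            # match B's character weeks
out <- merge(A, B, by = c("week", "store"), all.x = TRUE) # left join: keep every row of A
out$fill[is.na(out$fill)] <- 0                            # unmatched rows get fill = 0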

Adding rows to make a full long dataset for longitudinal data analysis

I am working with a long-format longitudinal dataset where each person has 1, 2 or 3 time points. In order to perform certain analyses I need to make sure that each person has the same number of rows, even if some of those rows consist of NAs because the person did not complete that time point.
Here is a sample of the data before adding the rows:
structure(list(Values = c(23, 24, 45, 12, 34, 23),
               P_ID = c(1, 1, 2, 2, 2, 3),
               Event_code = c(1, 2, 1, 2, 3, 1),
               Site_code = c(1, 1, 3, 3, 3, 1)),
          class = "data.frame", row.names = c(NA, -6L))
This is the data I aim to get after adding the relevant rows:
structure(list(Values = c(23, 24, NA, 45, 12, 34, 23, NA, NA),
               P_ID = c(1, 1, 1, 2, 2, 2, 3, 3, 3),
               Event_code = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
               Site_code = c(1, 1, 1, 3, 3, 3, 1, 1, 1)),
          class = "data.frame", row.names = c(NA, -9L))
I want to come up with code that would automatically add rows to the dataset conditional on whether the participant has had 1, 2 or 3 visits. Ideally it would fill the rest of the data with NAs while copying P_ID and Site_code, but if that is not possible I would be satisfied just with creating the right number of rows.
We could use fill() after doing a complete():
library(dplyr)
library(tidyr)
ExpandedDataset %>%
  complete(P_ID, Event_code) %>%
  fill(Site_code)
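One caveat: fill() fills downwards over the whole frame by default, so a Site_code could leak from one participant into the next if a participant's first row is the missing one. A safer variant is a sketch that fills within each participant only:
ExpandedDataset %>%
  complete(P_ID, Event_code) %>%
  group_by(P_ID) %>%
  fill(Site_code, .direction = "downup") %>% # fill within each P_ID only
  ungroup()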
I came up with quite long code, but you could wrap it in a function to make it easier:
Here's your dataframe:
df <- data.frame(ID = c(rep("P1", 2), rep("P2", 3), "P3"),
                 Event = c("baseline", "visit 2", "baseline", "visit 2", "visit 3", "baseline"),
                 Event_code = c(1, 2, 1, 2, 3, 1),
                 Site_code = c(1, 1, 2, 2, 2, 1))
How many records do you have per ID?
values <- table(df$ID)
What is the maximum number of records for a single patient?
target <- max(values)
Which specific patients have fewer records than the maximum?
uncompliant <- names(which(values < target))
And how many records do you have for those patients who have missing information?
rowcount <- values[which(values < target)]
So now, let's create the vectors of the data frame we will add to your original one. First, IDs:
IDs <- vector()
for (i in 1:length(rowcount)) {
  y <- rep(uncompliant[i], target - rowcount[i])
  IDs <- c(IDs, y)
}
And now, the site codes:
SC <- vector()
for (i in 1:length(rowcount)) {
  y <- rep(unique(df$Site_code[which(df$ID == uncompliant[i])]), target - rowcount[i])
  SC <- c(SC, y)
}
Finally, a data frame with the values we will introduce:
introduce <- data.frame(ID = IDs,
                        Event = rep(NA, length(IDs)),
                        Event_code = rep(NA, length(IDs)),
                        Site_code = SC)
Combine the original dataframe with the new values to be added and sort it so it looks nice:
final <- as.data.frame(rbind(df, introduce))
final <- final[order(final$ID), ]

Separate a column with uneven/unequal strings and with no delimiters

How would I separate a column like this, where part of the data has delimiters but the rest does not, and some of the strings are of unequal length?
Input:
id
142 TM500A2013PISA8/22/17BG
143 TM500CAGE2012QUDO8/22/1720+
Output:
category site garden plot year species date portion
142 TM 500 A 2013 PISA 8/22/17 BG
143 TM 500 CAGE 2012 QUDO 8/22/17 20+
I poked around other questions and tried something that might work if the strings were of equal length, i.e.:
df <- avgmass %>% separate(id, c("site", "garden", "plot", "year",
                                 "species", "sampledate", "portion"),
                           sep = cumsum(c(2, 3, 3, 4, 4, 5)))
But as the plot id is either A, B or CAGE, and the date contains "/", I am not sure how to approach it.
As I am relatively new to R, I tried searching for more details on how to use the sep argument but to no avail... Thank you for your help.
The code below may work for you, assuming that the "site", "garden" and "species" columns are of a fixed width.
library(dplyr)
library(tidyr)

df <- df %>%
  mutate(site = substr(id, 1, 2),
         garden = substr(id, 3, 5),
         # plot is either a single letter or the literal "CAGE";
         # everything after it shifts by 3 characters accordingly
         plot = ifelse(substr(id, 6, 9) == "CAGE", substr(id, 6, 9), substr(id, 6, 6)),
         year = ifelse(substr(id, 6, 9) == "CAGE", substr(id, 10, 13), substr(id, 7, 10)),
         species = ifelse(substr(id, 6, 9) == "CAGE", substr(id, 14, 17), substr(id, 11, 14)),
         sampledate = ifelse(substr(id, 6, 9) == "CAGE", substr(id, 18, nchar(id)), substr(id, 15, nchar(id)))) %>%
  separate(sampledate, into = c("m", "d", "y"), sep = "/") %>%
  # the trailing portion rides along with the two-digit year, so split it off
  mutate(portion = substr(y, 3, nchar(y)),
         sampledate = as.Date(paste(m, d, substr(y, 1, 2), sep = "-"), format = "%m-%d-%y"),
         m = NULL,
         d = NULL,
         y = NULL)
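An alternative sketch using a single regular expression with tidyr::extract(), assuming plot is either one capital letter or the literal "CAGE", species is always four letters, and portion is whatever trails the m/d/yy date:
library(tidyr)

df %>%
  extract(id,
          into = c("site", "garden", "plot", "year", "species", "sampledate", "portion"),
          regex = "^([A-Z]{2})(\\d{3})(CAGE|[A-Z])(\\d{4})([A-Z]{4})(\\d{1,2}/\\d{1,2}/\\d{2})(.*)$")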
