Conditional str_remove based on data frame column - r

I have a dataframe (pasted below), in which I am trying to set to blank the value of one column based on the value of another column. The idea is that if X6 equals Nbre CV or if X6 equals Nbre BVD, then I want X6 for that row to be blank.
Unfortunately using the following code the entire X6 column turns to NA or missing.
# Attempt to blank X6 where it equals `Nbre CV` or `Nbre BVD`
# (this is the version reported as not working).
extractstack <- extractstack %>%
# NOTE(review): this mutate() is not closed before the pipe; its closing
# parenthesis is the last one on the final line, so the two mutate() calls
# below are piped from across(...) inside this call instead of being chained
# after it.
mutate(across(everything(), as.character) %>%
# Backticked names are column references, so str_remove() receives the column
# values as its pattern argument.
mutate(X6 = if_else(X6 == `Nbre CV`, str_remove(X6, `Nbre CV`), X6)) %>%
# NOTE(review): the condition tests `Nbre CV` again while the removal uses
# `Nbre BVD` -- presumably the condition was meant to test `Nbre BVD`.
mutate(X6 = if_else(X6 == `Nbre CV`, str_remove(X6, `Nbre BVD`), X6)))
structure(list(X1 = c("", "", "40", "", "", "41", "", "", "42",
"", "", "43", "", "", "44", ""), X2 = c("", "", "EP. KAPALA",
"", "", "INST. MOTULE", "", "", "CABANE BABOA", "", "", "CABANE BANANGI",
"", "", "E.P.BINZI", ""), X3 = c("", "", "MOBATI-BOYELE", "",
"", "MOBATI-BOYELE", "", "", "MOBATI-BOYELE", "", "", "AVURU-GATANGA",
"", "", "AVURU-GATANGA", ""), X4 = c("", "", "BOGBASA", "", "",
"BOSOBEA", "", "", "BOSOBEA", "", "", "BANANGI", "", "", "GURUZA",
""), X5 = c("", "", "", "", "", "MOBENGE", "", "", "BABOA", "",
"", "DIFONGO", "", "", "DULIA", ""), X6 = c("", "", "BOGBASA",
"", "", "", "1", "", "", "1", "", "", "1", "", "", "1"), X7 = c("1",
"", "", "1", "", "", "4", "", "", "1", "", "", "1", "", "", "5"
), X8 = c("2", "", "", "2", "", "", "510 110", "", "", "510 111",
"", "", "510 112", "", "", "510 113"), X9 = c("510 108", "",
"", "510 109", "", "", "A - D", "", "", "A", "", "", "A", "",
"", "A - E"), page = c("4", "4", "4", "4", "5", "5", "5", "5",
"5", "5", "5", "5", "5", "5", "5", "5"), Plage = c("A - B", NA,
NA, "A - B", NA, NA, "A - D", NA, NA, "A", NA, NA, "A", NA, NA,
"A - E"), `Code SV` = c("510 108", NA, NA, "510 109", NA, NA,
"510 110", NA, NA, "510 111", NA, NA, "510 112", NA, NA, "510 113"
), `Nbre BVD` = c("2", NA, NA, "2", NA, NA, "4", NA, NA, "1",
NA, NA, "1", NA, NA, "5"), `Nbre CV` = c("1", NA, NA, "1", NA,
NA, "1", NA, NA, "1", NA, NA, "1", NA, NA, "1")), class = "data.frame", row.names = c(NA,
-16L))

That's basically Chris Ruehlemann's answer (I don't know why he removed it, I would remove this one for the original one):
# Blank X6 wherever it equals `Nbre BVD` or `Nbre CV` in the same row.
library(dplyr)
extractstack %>%
  # compare everything as text
  mutate(across(everything(), as.character)) %>%
  mutate(
    X6 = case_when(
      X6 == `Nbre BVD` | X6 == `Nbre CV` ~ "",
      # no match, or a comparison that is NA: keep X6 as it is
      TRUE ~ X6
    )
  )
compares X6 with the columns Nbre BVD and Nbre CV. If there is matching content, X6 will be changed to an empty string "", else X6 stays unchanged. But for your given data, this code doesn't replace anything, since there are simply no matches in X6 with Nbre BVD and Nbre CV besides NA-values.

Related

How to wrangle the dataset in R: reshaping and creating new columns with given information

I have a dataset that looks like below,
structure(list(nonyeasted_19 = c("Force (N)", "0", "-0.0077",
"0.0023", "-0.0707", "-0.2155", "-0.2026", "-0.0628", "-0.0481",
"-0.0601", "0.0302", "0.0475", "-0.0176", "0.008", "0.0569",
"0.0242", "0.0003", "0.0295", "0.028", "-0.0221", "-0.0333",
"0.0034", "0.004", "-0.0219", "-0.0216", "-0.0261"), nonyeasted_19.1 = c("Distance (m)",
"0", "0", "0", "0", "0", "0", "0.000002", "0.000004", "0.000006",
"0.000008", "0.00001", "0.000012", "0.000014", "0.000016", "0.000018",
"0.00002", "0.000022", "0.000024", "0.000026", "0.000028", "0.00003",
"0.000032", "0.000034", "0.000036", "0.000038"), nonyeasted_19.2 = c("Time (sec)",
"0", "0.002", "0.004", "0.006", "0.008", "0.01", "0.012", "0.014",
"0.016", "0.018", "0.02", "0.022", "0.024", "0.026", "0.028",
"0.03", "0.032", "0.034", "0.036", "0.038", "0.04", "0.042",
"0.044", "0.046", "0.048"), nonyeasted_19.3 = c("Status", "101",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"), yeasted_01 = c("Force (N)",
"0", "0.0024", "0.0307", "-0.0487", "-0.2063", "-0.1928", "-0.0421",
"-0.0278", "-0.0586", "0.0251", "0.0373", "-0.0084", "0.0597",
"0.091", "0.0246", "0.0318", "", "", "", "", "", "", "", "",
""), yeasted_01.1 = c("Distance (m)", "0", "0", "0", "0", "0",
"0", "0", "0.000001", "0.000003", "0.000005", "0.000007", "0.000009",
"0.000011", "0.000013", "0.000015", "0.000017", "", "", "", "",
"", "", "", "", ""), yeasted_01.2 = c("Time (sec)", "0", "0.002",
"0.004", "0.006", "0.008", "0.01", "0.012", "0.014", "0.016",
"0.018", "0.02", "0.022", "0.024", "0.026", "0.028", "0.03",
"", "", "", "", "", "", "", "", ""), yeasted_01.3 = c("Status",
"101", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "", "", "", "", "", "", "", "", "")), class = "data.frame", row.names = c(NA,
-26L))
Every four columns are in one group, and the group names are in the first row, while the column names are in the second row. I wonder whether there are any ways to concatenate the groups vertically and create two new columns with the group name row, where column 1 contains the text before the underscore and column 2 contains the text after the underscore.
I tried to use tidyverse, but after read.csv(), the variable names could not be preserved.
one approach:
sample data (example_data.csv):
group_A,group_A,group_B,group_B
var_1,var_2,var_3,var_4
143,897,234,382
Code:
library(readr) ## for the read_lines function
library(tidyr) ## wrangling (pivoting etc.)
## read csv but skip first line (containing group names):
df <- read.csv('path/to/example_data.csv',skip = 1)
## read first line of csv and convert it to vector of group names:
group_names <- read_lines('path/to/example_data.csv', n_max = 1) %>%
strsplit(',') %>% unlist
## change names of dataframe df to: group_name;variable_name
## (the group comes first, matching the into = c('group', 'var') split below)
names(df) <- paste(group_names, names(df), sep = ';')
## wrangle data (for documentation see https://tidyr.tidyverse.org/ )
df %>%
## one row per cell, with the combined name in group_var
pivot_longer(everything(), names_to = 'group_var', values_to = 'value') %>%
## split the combined name back into group and variable at the ';'
separate(group_var, into = c('group', 'var'), sep = ';') %>%
## split the group name at the underscore into its two parts
separate(group, into = c('yeasted_status', 'index'), sep='_') %>%
## spread the variables back out into one column per variable name
pivot_wider(names_from = var, values_from = value)
Result:
## A tibble: 2 x 6
# yeasted_status index var_1 var_2 var_3 var_4
# <chr> <chr> <int> <int> <int> <int>
# 1 group A 143 897 NA NA
# 2 group B NA NA 234 382
edit
or, if df is the dataframe derived from your dput output:
## mutate(), row_number() and select() come from dplyr
library(dplyr)
## assumes the names of df already have the form group_name;variable_name
## (as built with paste(..., sep = ';') earlier) -- TODO confirm for the
## dput data
## drop the first row, which holds the variable names rather than data
df[-1,] %>%
  ## number the observations BEFORE pivoting, so that all variables measured
  ## in the same original row share one ID and end up in one output row
  mutate(ID = row_number()) %>%
  ## no head() here: it would silently keep only the first six values
  pivot_longer(-ID, names_to = 'group_var', values_to = 'value') %>%
  separate(group_var, into = c('group', 'var'), sep = ';') %>%
  separate(group, into = c('yeasted_status', 'index'), sep='_') %>%
  ## values were read as text; empty strings become NA
  mutate(value = as.double(value)) %>%
  pivot_wider(id_cols = c(ID, yeasted_status, index), names_from = var, values_from = value) %>%
  ## the helper ID is only needed to keep observations apart
  select(-ID)

right_join and mutate does not preserve the index in R

I am mapping column_data to master: if a column value is present in master, then its key is saved instead,
e.g. Parent for P and Child for C.
The problem is that I get the output, but its rows are ordered differently.
DATA
column_data <- c("", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "P", "C", "C")
master <- list("Parent" = c("P"),
"Child" = c("C")
)
CODE
library(dplyr)
df <- data.frame("column" = column_data)
# stack(master) yields one row per entry of master: `values` holds the code
# and `ind` the name of the list element it came from
df <-stack(master) %>%
type.convert(as.is = TRUE) %>%
# the stacked lookup table is the left-hand table (x) of this join; df is y
right_join(df, by = c('values' = 'column')) %>%
# use the matched name, falling back to the raw value where there is none
mutate(output = coalesce(ind, values))
This Should be the output:
structure(list(values = c("", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "P", "C", "C"), ind = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Parent",
"Child", "Child"), output = c("", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "Parent", "Child", "Child")), class = "data.frame", row.names = c(NA,
-19L))
but instead i get this as output:
structure(list(values = c("P", "C", "C", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", ""), ind = c("Parent",
"Child", "Child", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), output = c("Parent", "Child", "Child", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "")), row.names = c(NA,
-19L), class = "data.frame")
With dplyr, if you do a right_join(x, y) then the result will include a subset of the matched rows for x, then unmatched rows for y.
From R documentation on mutating joins, the value returned will be:
An object of the same type as x. The order of the rows and columns of
x is preserved as much as possible. The output has the following
properties:
For inner_join(), a subset of x rows. For left_join(), all x rows. For
right_join(), a subset of x rows, followed by unmatched y rows. For
full_join(), all x rows, followed by unmatched y rows.
That is why you have the 3 matched rows at the beginning of your resulting data.frame.
To get the desired result preserving the row order of df, try a left_join as follows:
# Lookup table: one row per code in master, with the list name in `ind`.
df2 <- type.convert(stack(master), as.is = TRUE)
# df is the left-hand table, so its rows keep their original order; `output`
# is the matched name, or the raw value where nothing matched.
mutate(
  left_join(df, df2, by = c('column' = 'values')),
  output = coalesce(ind, column)
)
Output
column ind output
1 <NA>
2 <NA>
3 <NA>
4 <NA>
5 <NA>
6 <NA>
7 <NA>
8 <NA>
9 <NA>
10 <NA>
11 <NA>
12 <NA>
13 <NA>
14 <NA>
15 <NA>
16 <NA>
17 P Parent Parent
18 C Child Child
19 C Child Child

converting multiple columns from wide to long using pivot_longer

I get an error message when I want to convert multiple columns from wide to long with pivot_longer
I have code which converts from wide to long with gather but I have to do this column by column. I want to use pivot_longer to gather multiple columns rather than column by column.
This is some input data:
structure(list(id = c("81", "83", "85", "88", "1", "2"), look_work = c("yes",
"yes", "yes", "yes", "yes", "yes"), current_work = c("no", "yes",
"no", "no", "no", "no"), before_work = c("no", "NULL", "yes",
"yes", "yes", "yes"), keen_move = c("yes", "yes", "no", "no",
"no", "no"), city_size = c("village", "more than 500k inhabitants",
"more than 500k inhabitants", "village", "city up to 20k inhabitants",
"100k - 199k inhabitants"), gender = c("male", "female", "female",
"male", "female", "male"), age = c("18 - 24 years", "18 - 24 years",
"more than 50 years", "18 - 24 years", "31 - 40 years", "more than 50 years"
), education = c("secondary", "vocational", "secondary", "secondary",
"secondary", "secondary"), hf1 = c("", "", "", "1", "1", "1"),
hf2 = c("", "1", "1", "", "", ""), hf3 = c("", "", "", "",
"", ""), hf4 = c("", "", "", "", "", ""), hf5 = c("", "",
"", "", "", ""), hf6 = c("", "", "", "", "", ""), ac1 = c("",
"", "", "", "", "1"), ac2 = c("", "1", "1", "", "1", ""),
ac3 = c("", "", "", "", "1", ""), ac4 = c("", "", "", "",
"", ""), ac5 = c("", "", "", "", "", ""), ac6 = c("", "",
"", "", "", ""), cs1 = c("", "", "", "", "", ""), cs2 = c("",
"1", "1", "", "1", ""), cs3 = c("", "", "", "", "", "1"),
cs4 = c("", "", "", "1", "", ""), cs5 = c("", "", "", "",
"", ""), cs6 = c("", "", "", "", "", ""), cs7 = c("", "",
"", "", "", ""), cs8 = c("", "", "", "", "", ""), se1 = c("",
"", "1", "1", "", ""), se2 = c("", "", "", "", "1", ""),
se3 = c("", "1", "", "", "1", "1"), se4 = c("", "", "", "",
"", ""), se5 = c("", "", "", "", "", ""), se6 = c("", "",
"", "", "", ""), se7 = c("", "", "", "", "", ""), se8 = c("",
"", "", "1", "", "")), row.names = c(NA, 6L), class = "data.frame")
The code using gather is:
# Stack the hf* and ac* columns into key/value pairs, then keep only the
# combinations whose suffix matches (hf1 with ac1, hf2 with ac2, ...).
# substring(x, 3) drops the two-letter prefix and leaves the number.
df1 <- df %>%
  gather(key = "hf_com", value = "hf_com_freq", hf1:hf6) %>%
  gather(key = "ac_com", value = "ac_com_freq", ac1:ac6) %>%
  filter(substring(hf_com, 3) == substring(ac_com, 3))
# Same again for the cs* and se* columns.
df1 <- df1 %>%
  gather(key = "curr_sal", value = "curr_sal_freq", cs1:cs8) %>%
  gather(key = "exp_sal", value = "exp_sal_freq", se1:se8) %>%
  filter(substring(curr_sal, 3) == substring(exp_sal, 3))
The code using pivot_longer is:
# Attempt to stack all hf* columns at once (this call fails).
df_longer <- df %>%
pivot_longer(
cols = starts_with("hf"),
names_to = "hf_com",
values_to = "hf_freq",
names_prefix = "hf",
# NOTE(review): na.rm is the argument reported as unused in the error message
na.rm = TRUE)
The expected results which I get with gather are:
structure(list(id = c("81", "83", "85", "88", "1", "2"), look_work = c("yes",
"yes", "yes", "yes", "yes", "yes"), current_work = c("no", "yes",
"no", "no", "no", "no"), before_work = c("no", "NULL", "yes",
"yes", "yes", "yes"), keen_move = c("yes", "yes", "no", "no",
"no", "no"), city_size = c("village", "more than 500k inhabitants",
"more than 500k inhabitants", "village", "city up to 20k inhabitants",
"100k - 199k inhabitants"), gender = c("male", "female", "female",
"male", "female", "male"), age = c("18 - 24 years", "18 - 24 years",
"more than 50 years", "18 - 24 years", "31 - 40 years", "more than 50 years"
), education = c("secondary", "vocational", "secondary", "secondary",
"secondary", "secondary"), hf_com = c("hf1", "hf1", "hf1", "hf1",
"hf1", "hf1"), hf_com_freq = c("", "", "", "1", "1", "1"), ac_com = c("ac1",
"ac1", "ac1", "ac1", "ac1", "ac1"), ac_com_freq = c("", "", "",
"", "", "1"), curr_sal = c("cs1", "cs1", "cs1", "cs1", "cs1",
"cs1"), curr_sal_freq = c("", "", "", "", "", ""), exp_sal = c("se1",
"se1", "se1", "se1", "se1", "se1"), exp_sal_freq = c("", "",
"1", "1", "", "")), row.names = c(NA, 6L), class = "data.frame")
With pivot_longer, I get the following error message:
Error in pivot_longer(., cols = starts_with("hf"), names_to = "hf_com", :
unused argument (na.rm = TRUE)
Also, if there is no solution with pivot_longer, then a solution with data.table would be appreciated.
I have solved the problem:
This needs to be changed from:
# Failing version, repeated for comparison.
df_longer <- df %>%
pivot_longer(
cols = starts_with("hf"),
names_to = "hf_com",
values_to = "hf_freq",
names_prefix = "hf",
# the argument that triggers the "unused argument" error
na.rm = TRUE)
to:
# Working version: stack all hf* columns into hf_com / hf_freq.
df_longer <- df %>%
pivot_longer(
cols = starts_with("hf"),
names_to = "hf_com",
values_to = "hf_freq",
# strip the leading "hf" from the names stored in hf_com
names_prefix = "hf",
# drops rows whose value is NA; empty strings "" are not NA and are kept
values_drop_na = TRUE)

loop through a r dataframe and pass rows as parameters to a function

I want to loop through a dataframe and pass the rows as arguments to a function to summarise the totals from a dataframe named df3.
I have tried code using a traditional for loop but there are no results.
I have looked at pmap in https://adv-r.hadley.nz/functionals.html#pmap
but I don't see how to apply this example to my code.
Here is some data from the original data:
dput(head(df3,n=3))
structure(list(id = c("81", "83", "85"), look_work = c("yes",
"yes", "yes"), current_work = c("no", "yes", "no"), hf_l5k = c("",
"", ""), ac_l5k = c("", "", ""), hf_5_10k = c("", "1", "1"),
ac_5_10k = c("", "1", "1"), hf_11_20k = c("", "", ""), ac_11_20k = c("",
"", ""), hf_21_50k = c("", "", ""), ac_21_50k = c("", "",
""), hf_51_100k = c("", "", ""), ac_51_100k = c("", "", ""
), hf_m100k = c("", "", ""), ac_m100k = c("", "", ""), s_l1000 = c("",
"", ""), se_l1000 = c("", "", "1"), s_1001_1500 = c("", "1",
"1"), se_1001_1500 = c("", "", ""), s_2001_3000 = c("", "",
""), se_2001_3000 = c("", "1", ""), s_3001_4000 = c("", "",
""), se_3001_4000 = c("", "", ""), s_4001_5000 = c("", "",
""), se_4001_5000 = c("", "", ""), s_5001_6000 = c("", "",
""), se_5001_6000 = c("", "", ""), s_m6000 = c("", "", ""
), se_m6000 = c("", "", ""), s_n_ans = c("", "", ""), se_n_ans = c("",
"", ""), before_work = c("no", "NULL", "yes"), keen_move = c("yes",
"yes", "no"), city_size = c("village", "more than 500k inhabitants",
"more than 500k inhabitants"), gender = c("male", "female",
"female"), age = c("18 - 24 years", "18 - 24 years", "more than 50 years"
), education = c("secondary", "vocational", "secondary")), row.names = c(NA,
3L), class = "data.frame")
Here is the dataframe hf_names for the parameters:
structure(list(hf_names = c("hf_l5k", "hf_5_10k", "hf_11_20k",
"hf_21_50k", "hf_51_100k", "hf_m100k"), job = c("hf_l5k_job",
"hf_5_10k_job", "hf_11_20k_job", "hf_21_50k_job", "hf_51_100k_job",
"hf_m100k_job"), tot = c("hf_l5k_tot", "hf_5_10k_tot", "hf_11_20k_tot",
"hf_21_50k_tot", "hf_51_100k_tot", "hf_m100k_tot")), class = "data.frame", row.names = c(NA,
-6L))
Here is the code I have tried with a traditional for loop:
library(dplyr)
# Intended to count the rows of df in which a given column equals "1".
# NOTE(review): the column names arrive as strings, so the dplyr verbs below
# do not look up the columns they name -- see the notes on each line.
tot_function <- function(df, filter_tot, col_name1, col_name2) {
# filter desired columns for all jobs
# NOTE(review): the filter_tot argument is overwritten here and never read.
# Assuming df has no column called col_name1, the comparison uses the
# argument's own value (a column name) rather than the column it names.
filter_tot <- df %>% filter(col_name1=="1") %>%
# the count column is literally named "col_name2", not the value passed in
summarise(col_name2 = n())
# the function's value is that of the assignment above, returned invisibly
}
# NOTE(review): hf_names3 is not defined in the code shown; presumably it is
# the hf_names data frame -- confirm. If it is a data frame, seq_along()
# counts its columns, not its rows.
for (i in seq_along(hf_names3)) {
# the value returned by tot_function() is neither stored nor printed
tot_function(df3, hf_names3$tot[i], hf_names3$hf_names[i], hf_names3$job[i])
}
The expected results would be dataframes or vectors:
hf_l5k_jobs hf_l5_10k_jobs
10 193
but nothing is generated by this code as it looks at simple functions such as trim and runif.
I don't think you need to overcomplicate this. You can take names from hf_names, subset that column from df3 and count the number of 1's in that column.
# For each column name listed in hf_names$hf_names, count the entries of
# that df3 column that equal 1. df3[[x]] extracts the column by its string
# name; the character values are compared with 1 after coercion.
sapply(hf_names$hf_names, function(x) sum(df3[[x]] == 1))
# hf_l5k hf_5_10k hf_11_20k hf_21_50k hf_51_100k hf_m100k
# 0 2 0 0 0 0
If you prefer tidyverse you can change sapply to map.* variations
# Same count with purrr; map_int() returns an integer vector.
purrr::map_int(hf_names$hf_names, ~sum(df3[[.]] == 1))

Conditional means based on other columns in R with dplyr

Let's say I have the following data:
structure(list(political_spectrum = c(5L, 15L, 12L, 30L, 100L,
0L, 27L, 52L, 38L, 64L, 0L, 0L, 76L, 50L, 16L, 16L, 0L, 23L,
0L, 25L, 68L, 50L, 4L, 0L, 50L), politics_today = c("Independent",
"Strong Democrat", "Weak Democrat", "Weak Democrat", "Weak Republican",
"Strong Democrat", "Weak Democrat", "Weak Democrat", "Independent",
"Weak Democrat", "Strong Democrat", "Independent", "Weak Republican",
"Weak Democrat", "Weak Democrat", "Strong Democrat", "Strong Democrat",
"Strong Democrat", "Strong Democrat", "Strong Democrat", "Independent",
"Independent", "Strong Democrat", "Strong Democrat", "Independent"
), stranger_things_universe_mc = c("The Demagorgon", "", "",
"", "", "", "", "", "", "The Stranger Land", "The Demagorgon",
"The Upside Down", "", "", "", "", "", "The Upside Down", "The Shadowland",
"", "", "", "", "", "The Shadowland"), stranger_things_universe_answer = c("The Upside Down",
"", "", "", "", "", "", "", "", "The Upside Down", "The Upside Down",
"The Upside Down", "", "", "", "", "", "The Upside Down", "The Upside Down",
"", "", "", "", "", "The Upside Down"), stranger_things_universe_confidence = c(32L,
NA, NA, NA, NA, NA, NA, NA, NA, 67L, 94L, 89L, NA, NA, NA, NA,
NA, 51L, 10L, NA, NA, NA, NA, NA, 0L), stranger_things_universe_importance = c("Don't care at all",
"", "", "", "", "", "", "", "", "Care somewhat strongly", "Care a little",
"Care somewhat strongly", "", "", "", "", "", "Care somewhat",
"Don't care at all", "", "", "", "", "", "Don't care at all"),
tupac_mc = c("", "Biggie Smalls", "", "", "", "", "", "Biggie Smalls",
"Biggie Smalls", "", "", "Biggie Smalls", "", "", "", "",
"", "", "Biggie Smalls", "", "", "Ice Cube", "", "", ""),
tupac_answer = c("", "Biggie Smalls", "", "", "", "", "",
"Biggie Smalls", "Biggie Smalls", "", "", "Biggie Smalls",
"", "", "", "", "", "", "Biggie Smalls", "", "", "Biggie Smalls",
"", "", ""), tupac_confidence = c(NA, 70L, NA, NA, NA, NA,
NA, 71L, 76L, NA, NA, 100L, NA, NA, NA, NA, NA, NA, 100L,
NA, NA, 32L, NA, NA, NA), tupac_importance = c("", "Don't care at all",
"", "", "", "", "", "Care somewhat", "Don't care at all",
"", "", "Care strongly", "", "", "", "", "", "", "Care a little",
"", "", "Don't care at all", "", "", ""), uber_ceo_mc = c("John Zimmer",
"", "", "", "", "Travis Kalanick", "", "", "", "Travis Kalanick",
"", "", "", "", "", "", "", "John Zimmer", "Travis Kalanick",
"Travis Kalanick", "", "", "", "", ""), uber_ceo_answer = c("Travis Kalanick",
"", "", "", "", "Travis Kalanick", "", "", "", "Travis Kalanick",
"", "", "", "", "", "", "", "Travis Kalanick", "Travis Kalanick",
"Travis Kalanick", "", "", "", "", ""), uber_ceo_confidence = c(0L,
NA, NA, NA, NA, 94L, NA, NA, NA, 69L, NA, NA, NA, NA, NA,
NA, NA, 5L, 13L, 17L, NA, NA, NA, NA, NA), uber_ceo_importance = c("Don't care at all",
"", "", "", "", "Care strongly", "", "", "", "Care somewhat",
"", "", "", "", "", "", "", "Don't care at all", "Don't care at all",
"Care somewhat", "", "", "", "", ""), black_panther_mc = c("",
"T'Chaka", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "T'Chaka", "", ""), black_panther_answer = c("",
"T'Challa", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "T'Challa", "", ""), black_panther_confidence = c(NA,
63L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 34L, NA, NA), black_panther_importance = c("",
"Don't care at all", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "Care a little",
"", ""), the_office_mc = c("The Mindy Project", "", "", "",
"", "", "", "", "", "", "", "", "", "", "The Office", "",
"", "The Mindy Project", "", "", "", "", "The Office", "",
""), the_office_answer = c("The Office", "", "", "", "",
"", "", "", "", "", "", "", "", "", "The Office", "", "",
"The Office", "", "", "", "", "The Office", "", ""), the_office_confidence = c(43L,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2L, NA,
NA, 11L, NA, NA, NA, NA, 100L, NA, NA), the_office_importance = c("Don't care at all",
"", "", "", "", "", "", "", "", "", "", "", "", "", "Don't care at all",
"", "", "Care a little", "", "", "", "", "Care a little",
"", ""), arms_manufacturing_company_mc = c("J. Brockton & Sons",
"", "", "O.F. Mossberg & Sons", "", "", "", "", "", "", "",
"", "J. Brockton & Sons", "", "", "", "", "", "", "", "",
"", "", "", "J. Brockton & Sons"), arms_manufacturing_company_answer = c("J. Brockton & Sons",
"", "", "J. Brockton & Sons", "", "", "", "", "", "", "",
"", "J. Brockton & Sons", "", "", "", "", "", "", "", "",
"", "", "", "J. Brockton & Sons"), arms_manufacturing_company_confidence = c(91L,
NA, NA, 24L, NA, NA, NA, NA, NA, NA, NA, NA, 37L, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 100L), arms_manufacturing_company_importance = c("Don't care at all",
"", "", "Don't care at all", "", "", "", "", "", "", "",
"", "Don't care at all", "", "", "", "", "", "", "", "",
"", "", "", "Don't care at all")), class = c("data.table",
"data.frame"), row.names = c(NA, -25L))
I'm trying to do something like the following:
# Attempt: mean confidence per political affiliation and question.
test %>%
# stack every question column into name/value pairs
gather(name, value, -c('political_spectrum', 'politics_today')) %>%
# drop unanswered cells
filter(value != "") %>%
# question id = column name without its last "_suffix"
mutate(question_id = sub("_[^_]+$", "", name)) %>%
# flag the rows that came from a *_confidence column
mutate(confidence = grepl("_confidence", name)) %>%
group_by(politics_today, question_id) %>%
# NOTE(review): value holds text after stacking mixed column types, so
# mean() is applied to character data here
summarize(mean_confidence = mean(value[confidence == "TRUE"]))
in which I get the mean_confidence values for each political affiliation, but only for specific rows in the "value" column. In order to run the mean only on "confidence" columns, I am trying to do a filter via mean(value[confidence == "TRUE"]), but am not sure the correct way to do this.
I think you need to change your code to
library(tidyverse)
# Mean confidence per political affiliation and question.
test %>%
# stack every question column into name/value pairs
gather(name, value, -c('political_spectrum', 'politics_today')) %>%
# drop unanswered cells
filter(value != "") %>%
# question id = column name without its last "_suffix";
# confidence flags the rows that came from a *_confidence column
mutate(question_id = sub("_[^_]+$", "", name),
confidence = grepl("_confidence", name)) %>%
group_by(politics_today, question_id) %>%
# index with the logical flag directly and convert the text to numbers
# before averaging
summarize(mean_confidence = mean(as.numeric(value[confidence])))
# politics_today question_id mean_confidence
# <chr> <chr> <dbl>
# 1 Independent arms_manufacturing_company 95.5
# 2 Independent stranger_things_universe 40.3
# 3 Independent the_office 43
# 4 Independent tupac 69.3
# 5 Independent uber_ceo 0
# 6 Strong Democrat black_panther 48.5
# 7 Strong Democrat stranger_things_universe 51.7
# 8 Strong Democrat the_office 55.5
# 9 Strong Democrat tupac 85
#10 Strong Democrat uber_ceo 32.2
#11 Weak Democrat arms_manufacturing_company 24
#12 Weak Democrat stranger_things_universe 67
#13 Weak Democrat the_office 2
#14 Weak Democrat tupac 71
#15 Weak Democrat uber_ceo 69
#16 Weak Republican arms_manufacturing_company 37
Since your value column has got both numeric and character values, it gets converted to a character column so you need to change the value where confidence == TRUE to numeric.

Resources