R: select certain values from multiple columns using conditions

I want to select certain values from multiple columns using conditions (also, let's assign row 1 as ID#1, ..., row 5 as ID#5):
column1 <- c("rice 2", "apple 4", "melon 6", "blueberry 4", "orange 6")
column2 <- c("rice 8", "blueberry 8", "grape 10", "water 10", "mango 3")
column3 <- c("rice 6", "apple 8", "blueberry 12", "pineapple 8", "mango 3")
I want to get a new column using the IDs, with the conditions rice > 5, blueberry > 7, or orange > 5.
First, I would like to get ID#1, ID#2, ID#3, ID#5.
Second, I would like to count how many conditions are met per ID.
I would like to get these results:
ID#1 -> 2 conditions met
ID#2 -> 1 condition met
ID#3 -> 1 condition met
ID#4 -> 0 conditions met
ID#5 -> 1 condition met

If I understood the question correctly, then one approach could be:
library(dplyr)

cols <- names(df)[-1]

df1 <- df %>%
  mutate_if(is.factor, as.character) %>%
  # for each condition, split every "fruit number" cell and flag the cells where it holds
  mutate(rice_gt_5 = (select(., one_of(cols)) %>%
    rowwise() %>%
    mutate_all(funs(strsplit(., split = " ")[[1]][1] == 'rice' & as.numeric(strsplit(., split = " ")[[1]][2]) > 5)) %>%
    rowSums)) %>%
  mutate(blueberry_gt_7 = (select(., one_of(cols)) %>%
    rowwise() %>%
    mutate_all(funs(strsplit(., split = " ")[[1]][1] == 'blueberry' & as.numeric(strsplit(., split = " ")[[1]][2]) > 7)) %>%
    rowSums)) %>%
  mutate(orange_gt_5 = (select(., one_of(cols)) %>%
    rowwise() %>%
    mutate_all(funs(strsplit(., split = " ")[[1]][1] == 'orange' & as.numeric(strsplit(., split = " ")[[1]][2]) > 5)) %>%
    rowSums))
#IDs which satisfy at least one of your conditions i.e. rice > 5 OR blueberry > 7 OR orange > 5
df1$ID[which(df1 %>% select(rice_gt_5, blueberry_gt_7, orange_gt_5) %>% rowSums() >0)]
#[1] 1 2 3 5
#How many conditions are met per ID
df1 %>%
  mutate(no_of_cond_met = rowSums(select(., one_of(c("rice_gt_5", "blueberry_gt_7", "orange_gt_5"))))) %>%
  select(ID, no_of_cond_met)
# ID no_of_cond_met
#1 1 2
#2 2 1
#3 3 1
#4 4 0
#5 5 1
Sample data:
df <- structure(list(ID = 1:5, column1 = structure(c(5L, 1L, 3L, 2L,
4L), .Label = c("apple 4", "blueberry 4", "melon 6", "orange 6",
"rice 2"), class = "factor"), column2 = structure(c(4L, 1L, 2L,
5L, 3L), .Label = c("blueberry 8", "grape 10", "mango 3", "rice 8",
"water 10"), class = "factor"), column3 = structure(c(5L, 1L,
2L, 4L, 3L), .Label = c("apple 8", "blueberry 12", "mango 3",
"pineapple 8", "rice 6"), class = "factor")), .Names = c("ID",
"column1", "column2", "column3"), row.names = c(NA, -5L), class = "data.frame")

Related

Complex Long-Wide Dataset to Long Dataset in R

I have a complex dataset that looks like this:
df1 <- tibble::tribble(
  ~"Canada > London", ~"", ~"Notes", ~"United Kingdom > London", ~"", ~"",
  "Restaurant", "Price", "Range", "Restaurant", "Price", "Range",
  "Fried beef", "27", "25-30", "Fried beef", "29", "25 - 35",
  "Fried potato", "5", "3 - 8", "Fried potato", "8", "3 - 8",
  "Bar", "Price", "Range", "Price", "Range", "",
  "Beer Lager", "5", "4 - 8", "Beer Lager", "6", "4 - 8",
  "Beer Dark", "4", "3 - 7", "Beer Dark", "5", "3 - 7"
)
It is long in its parameters (like Beer Lager, Beer Dark, ...) and wide in its data inputs (many wide blocks such as Canada > London or United Kingdom > London).
The desired output would be two datasets: the first containing the Values and the second containing the Ranges.
Any suggestions would be much appreciated :)
Your data is neither wide nor long but is a messy data table which needs some cleaning to convert it to tidy data. Afterwards you could get your desired tables using tidyr::pivot_wider:
library(dplyr)
library(tidyr)
library(purrr)
tidy_data <- function(.data, cols) {
  .data <- .data[cols]
  place <- names(.data)[[1]]
  .data |>
    rename(product = 1, price = 2, range = 3) |>
    filter(!price %in% c("Price", "Range")) |>
    mutate(place = place)
}

df1_tidy <- purrr::map_dfr(list(1:3, 4:6), tidy_data, .data = df1)

df1_tidy |>
  select(place, product, price) |>
  pivot_wider(names_from = product, values_from = price)
#> # A tibble: 2 × 5
#> place `Fried beef` `Fried potato` `Beer Lager` `Beer Dark`
#> <chr> <chr> <chr> <chr> <chr>
#> 1 Canada > London 27 5 5 4
#> 2 United Kingdom > London 29 8 6 5
df1_tidy |>
  select(place, product, range) |>
  pivot_wider(names_from = product, values_from = range, names_glue = "{product} Range")
#> # A tibble: 2 × 5
#> place `Fried beef Range` Fried potato Rang…¹ Beer …² Beer …³
#> <chr> <chr> <chr> <chr> <chr>
#> 1 Canada > London 25-30 3 - 8 4 - 8 3 - 7
#> 2 United Kingdom > London 25 - 35 3 - 8 4 - 8 3 - 7
#> # … with abbreviated variable names ¹​`Fried potato Range`, ²​`Beer Lager Range`,
#> # ³​`Beer Dark Range`
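If the sheet contained more than two place blocks, the column groups passed to tidy_data would not need to be hard-coded as list(1:3, 4:6); a sketch of building them programmatically, assuming every block spans exactly three columns:
# one group of three consecutive column indices per place block
col_groups <- split(seq_along(df1), ceiling(seq_along(df1) / 3))
df1_tidy <- purrr::map_dfr(col_groups, tidy_data, .data = df1)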
I agree with @stefan. You actually have 4 tables, or 2 depending on how you look at it. Here is an implementation of two functions that start the cleaning and formatting process: the first splits the df into sub-dfs by rows, and the second splits each of those by columns. After that it is easier to format, clean, and merge the dfs into one.
library(tidyverse)
df0 = tibble::tribble(
  ~"Canada > London", ~"", ~"Notes", ~"United Kingdom > London", ~"", ~"",
  "Restaurant", "Price", "Range", "Restaurant", "Price", "Range",
  "Fried beef", "27", "25-30", "Fried beef", "29", "25 - 35",
  "Fried potato", "5", "3 - 8", "Fried potato", "8", "3 - 8",
  "Bar", "Price", "Range", "Price", "Range", "",
  "Beer Lager", "5", "4 - 8", "Beer Lager", "6", "4 - 8",
  "Beer Dark", "4", "3 - 7", "Beer Dark", "5", "3 - 7"
)
split_rows = function(df){
  # breaks of sub-dfs within the original df (a sub-df starts where column 2 is "Price")
  df_breaks = df[, 2] == "Price"
  df_breaks = (1:length(df_breaks))[df_breaks]
  # list to populate in the loop with sub-dfs
  df_list = c()
  for (i in 1:length(df_breaks)) {
    # get start of sub-df
    start = df_breaks[i]
    # get end of sub-df
    if (i == length(df_breaks)) {
      end = nrow(df) # if it is the last one, set it to the last row of the original df
    } else {
      end = df_breaks[i + 1] - 1 # else, set it to the next start - 1
    }
    # subset df
    df_temp = df[start:end, ]
    # first row as header
    colnames(df_temp) = df_temp[1, ]
    df_temp = df_temp[-1, ]
    # append to df_list
    df_list = append(df_list, list(df_temp))
  }
  return(df_list)
}
split_cols = function(df_list, second_df_col_start = 4){
  df_list = lapply(df_list, function(df){
    df1 = df[, 1:(second_df_col_start - 1)]
    df2 = df[, second_df_col_start:ncol(df)]
    return(list(df1, df2))
  })
  return(df_list)
}

output = split_rows(df0) %>%
  split_cols()
output:
[[1]]
[[1]][[1]]
# A tibble: 2 × 3
Restaurant Price Range
<chr> <chr> <chr>
1 Fried beef 27 25-30
2 Fried potato 5 3 - 8
[[1]][[2]]
# A tibble: 2 × 3
Restaurant Price Range
<chr> <chr> <chr>
1 Fried beef 29 25 - 35
2 Fried potato 8 3 - 8
[[2]]
[[2]][[1]]
# A tibble: 2 × 3
Bar Price Range
<chr> <chr> <chr>
1 Beer Lager 5 4 - 8
2 Beer Dark 4 3 - 7
[[2]][[2]]
# A tibble: 2 × 3
Price Range ``
<chr> <chr> <chr>
1 Beer Lager 6 4 - 8
2 Beer Dark 5 3 - 7
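A rough sketch of the final "merge into one" step mentioned above (the place names are hard-coded here as an assumption, since after splitting they are no longer attached to the sub-dfs):
library(purrr)

places <- c("Canada > London", "United Kingdom > London")

combined <- map_dfr(output, function(pair) {
  map_dfr(seq_along(pair), function(i) {
    piece <- pair[[i]]
    names(piece) <- c("product", "price", "range") # harmonise the differing headers
    piece$place <- places[i]
    piece
  })
})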

how to change one col value using the information from other cols in the same df

I have a data.frame that looks like this:
I want to generate a new df as a codebook where the numbers in the Label column are replaced using the information from ID and Subject.
What should I do?
The codebook that I want to achieve is something that looks like this:
Sample data can be built using this code:
df<-structure(list(Var = c("Subject1", "Subject2", "Subject4", "Subject5",
"Subject6", "Score1", "Score2", "Score3", "Score4", "Score5",
"Score6", "TestDate1", "TestDate2", "TestDate3", "TestDate4",
"TestDate5", "TestDate6"), Label = c("Subject 1", "Subject 2",
"Subject 4", "Subject 5", "Subject 6", "Score for Subject 1",
"Score for Subject 2", "Score for Subject 3", "Score for Subject 4",
"Score for Subject 5", "Score for Subject 6", "Date for test Subject 1",
"Date for test Subject 2", "Date for test Subject 3", "Date for test Subject 4",
"Date for test Subject 5", "Date for test Subject 6"), ID = c(1,
2, 3, 4, 5, 6, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Subject = c("Math",
"ELA", "PE", "Art", "Physic", "Chemistry", NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA)), row.names = c(NA, -17L), class = c("tbl_df",
"tbl", "data.frame"))
We can use str_replace_all with a named vector
library(dplyr)
library(stringr)
df1 <- df %>%
  transmute(Var, Label = str_replace_all(Label,
    setNames(na.omit(Subject), na.omit(ID))))
-output
df1
# A tibble: 17 x 2
# Var Label
# <chr> <chr>
# 1 Subject1 Subject Math
# 2 Subject2 Subject ELA
# 3 Subject4 Subject Art
# 4 Subject5 Subject Physic
# 5 Subject6 Subject Chemistry
# 6 Score1 Score for Subject Math
# 7 Score2 Score for Subject ELA
# 8 Score3 Score for Subject PE
# 9 Score4 Score for Subject Art
#10 Score5 Score for Subject Physic
#11 Score6 Score for Subject Chemistry
#12 TestDate1 Date for test Subject Math
#13 TestDate2 Date for test Subject ELA
#14 TestDate3 Date for test Subject PE
#15 TestDate4 Date for test Subject Art
#16 TestDate5 Date for test Subject Physic
#17 TestDate6 Date for test Subject Chemistry
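As a brief aside (not from the original answer): the named vector passed to str_replace_all() uses its names as regex patterns and its values as replacements, so every digit that appears as a name is swapped for the matching Subject:
library(stringr)

# tiny illustration of the named-vector replacement
str_replace_all(c("Subject 1", "Score for Subject 2"),
                c("1" = "Math", "2" = "ELA"))
#> [1] "Subject Math"          "Score for Subject ELA"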
Or, using gsubfn:
library(gsubfn)
df$Label <- with(df, gsubfn("(\\d+)",
  setNames(as.list(na.omit(Subject)), na.omit(ID)), Label))

create dataframe from list of lists of data.frames

I have a list of lists of data.frames, which I would like to convert to a data.frame. The structure is as follows:
l_of_lists <- list(
year1 = list(
one = data.frame(date = c("Jan-10", "Jan-22"), type = c("type 1", "type 2")),
two = data.frame(date = c("Feb-1", "Feb-28"), type = c("type 2", "type 3")),
three = data.frame(date = c("Mar-10", "Mar-15"), type = c("type 1", "type 4"))
),
year2 = list( # dates is used here on purpose, as the names don't perfectly match
one = data.frame(dates = c("Jan-22"), type = c("type 2"), another_col = c("entry 2")),
two = data.frame(date = c("Feb-10", "Feb-18"), type = c("type 2", "type 3"), another_col = c("entry 2", "entry 3")),
three = data.frame(date = c("Mar-10", "Mar-15"), type = c("type 1", "type 4"), another_col = c("entry 4", "entry 5"))
),
year3 = list( # this deliberately only contains two data frames
one = data.frame(date = c("Jan-10", "Jan-12"), type = c("type 1", "type 2")),
two = data.frame(date = c("Feb-8", "Jan-28"), type = c("type 2", "type 3"))
))
The data frame has two particularities I tried to mimic above:
the column names differ by 1-2 characters (e.g. date vs. dates)
some columns are only present in some data frames (e.g. another_col)
I now would like to convert this to a data frame (I tried different calls to rbind and do.call, as described e.g. here, without success) and would like to
- match on column names tolerantly (if the column names differ by only 1-2 characters, I want them to be matched), and
- fill non-existent columns with NA.
I want a data frame similar to the following:
year level date type another_col
1 one "Jan-10" "type 1" NA
1 one "Jan-22" "type 2" NA
1 two "Feb-1" "type 2" NA
1 two "Feb-28" "type 3" NA
1 three "Mar-10" "type 1" NA
1 three "Mar-15" "type 4" NA
2 one "Jan-22" "type 2" "entry 2"
2 two "Feb-1" "type 2" "entry 2"
2 two "Feb-28" "type 3" "entry 3"
2 three "Mar-10" "type 1" "entry 4"
2 three "Mar-15" "type 4" "entry 5"
3 one "Jan-10" "type 1" NA
3 one "Jan-12" "type 2" NA
3 two "Feb-8" "type 2" NA
3 two "Feb-28" "type 3" NA
Can someone point out if rbind is the correct path here - and what I am missing?
You could do something like the following using purrr and dplyr:
# (using the l_of_lists defined in the question above)
# add libraries
library(dplyr)
library(purrr)
# Map bind_rows to each list within the list
l_of_lists %>%
  map_dfr(~bind_rows(.x, .id = "level"), .id = "year")
This will yield:
year level date type dates another_col
1 year1 one Jan-10 type 1 <NA> <NA>
2 year1 one Jan-22 type 2 <NA> <NA>
3 year1 two Feb-1 type 2 <NA> <NA>
4 year1 two Feb-28 type 3 <NA> <NA>
5 year1 three Mar-10 type 1 <NA> <NA>
6 year1 three Mar-15 type 4 <NA> <NA>
7 year2 one <NA> type 2 Jan-22 entry 2
8 year2 two Feb-10 type 2 <NA> entry 2
9 year2 two Feb-18 type 3 <NA> entry 3
10 year2 three Mar-10 type 1 <NA> entry 4
11 year2 three Mar-15 type 4 <NA> entry 5
12 year3 one Jan-10 type 1 <NA> <NA>
13 year3 one Jan-12 type 2 <NA> <NA>
14 year3 two Feb-8 type 2 <NA> <NA>
15 year3 two Jan-28 type 3 <NA> <NA>
Then of course you can do some regex parsing to keep only the numeric year:
l_of_lists %>%
  map_dfr(~bind_rows(.x, .id = "level"), .id = "year") %>%
  mutate(year = substring(year, regexpr("\\d", year)))
If you know that date and dates are the same, you can always use mutate to change them to whichever value is not missing (i.e. mutate(date = ifelse(!is.na(date), date, dates))).
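A sketch of that idea using dplyr::coalesce instead of ifelse (assuming date and dates never disagree when both are present):
l_of_lists %>%
  map_dfr(~bind_rows(.x, .id = "level"), .id = "year") %>%
  mutate(year = substring(year, regexpr("\\d", year)),
         date = coalesce(date, dates)) %>%
  select(-dates)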

Counting the number of occurrences of a combination of values in r

I'm working with data concerning different cases going through a process consisting of different phases during a certain period in time. Each case has a unique id number. A process can start in multiple phases and ends in the phase "Finished" (except for processes that are still ongoing). A case can go through a process multiple times.
The data looks similar to this:
library(dplyr)
df1 <- structure(list(id = c("1", "1", "2", "2", "2", "2", "3", "3",
"3", "3", "3", "3", "3", "3", "3", "3"), time = structure(c(17453,
17458, 17453, 17462, 17727, 17735, 17453, 17484, 17568, 17665,
17665, 17709, 17727, 17727, 17757, 17819), class = "Date"), old_fase =
c(NA, "Fase 1", NA, "Fase 1", "Finished", "Fase 1", NA, "Fase 1", "Fase 2A",
"Finished", "Fase 2A", "Fase 2B", "Finished", "Fase 2B", "Fase 1",
"Fase 2A"), new_fase = c("Fase 1", "Finished", "Fase 1", "Finished",
"Fase 1", "Finished", "Fase 1", "Fase 2A", "Finished", "Fase 2A",
"Fase 2B", "Finished", "Fase 2B", "Fase 1", "Fase 2A", "Fase 2B"
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -16L))
For my analysis I want to create a new id based on the occurrence of each process per id. Using group_by and mutate on "id" and "new_fase" creates the following incorrect solution. This happens because of the first occurrence of "Fase 2B" in row 11.
df1 %>%
  group_by(id, new_fase) %>%
  mutate(occurrence = row_number())
The correct solution should look like this:
df1 %>%
  mutate(occurrence = c(rep(1, 4), 2, 2, rep(1, 3), rep(2, 3), rep(3, 4)))
I tried multiple approaches and read multiple Stackoverflow posts, but I am not able to figure it out correctly. Any help is appreciated, preferably using a tidyverse solution.
We can use ave from base R
df2$occurrence <- with(df2, ave(seq_along(id), id, fase, FUN = seq_along))
Or with data.table
library(data.table)
setDT(df2)[, occurrence := seq_len(.N), .(id, fase)]
df3 <- df1 %>%
  group_by(id, fase) %>%
  mutate(occurrence = row_number())
df3
# A tibble: 18 x 4
# Groups: id, fase [9]
id fase time occurrence
<dbl> <chr> <date> <int>
1 1 a 2018-01-01 1
2 1 b 2018-01-02 1
3 1 c 2018-01-03 1
4 2 a 2018-01-01 1
5 2 b 2018-01-02 1
6 2 c 2018-01-03 1
7 2 a 2018-01-04 2
8 2 b 2018-01-05 2
9 2 c 2018-01-06 2
10 2 a 2018-01-07 3
11 2 b 2018-01-08 3
12 2 c 2018-01-09 3
13 3 a 2018-01-01 1
14 3 b 2018-01-02 1
15 3 c 2018-01-03 1
16 3 a 2018-01-04 2
17 3 b 2018-01-05 2
18 3 c 2018-01-06 2
all(df2==df3)
[1] TRUE
You break down (group) the df into parts where each part has the same id and phase, and then you simply number the rows in each of these parts. Note this assumes the df is already sorted chronologically, as in your sample data. If this is not true, you'll have to sort it in advance by time.
I found this temporary solution (thanks to iod's solution on the first example using group_by and mutate).
library(tidyr) # for fill()

df1 %>%
  filter(is.na(old_fase) | old_fase == "Finished") %>% # indicates the beginning of a new process
  group_by(id) %>%
  mutate(occurrence = row_number()) %>%
  select(id, time, occurrence) %>%
  left_join(df1, ., by = c("id", "time")) %>%
  fill(occurrence)
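An alternative sketch (not from the original answers): since a new process starts whenever old_fase is NA or "Finished", a cumulative sum of those start markers yields the same occurrence counter directly, assuming the rows are already sorted by time within each id:
df1 %>%
  group_by(id) %>%
  mutate(occurrence = cumsum(is.na(old_fase) | old_fase == "Finished")) %>%
  ungroup()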

selecting a subset of data based on another column

I have a dataset which looks something like this:
Area Num
[1,] "Area 1" "99"
[2,] "Area 3" "85"
[3,] "Area 1" "60"
[4,] "Area 2" "90"
[5,] "Area 1" "40"
[6,] "Area 3" NA
[7,] "Area 4" "10"
...
code:
structure(c("Area 1", "Area 3", "Area 1", "Area 2", "Area 1",
"Area 3", "Area 4", "99", "85", "60", "90", "40", NA, "10"), .Dim = c(7L,
2L), .Dimnames = list(NULL, c("Area", "Num")))
I need to do some calculation on values in Num for each Area, for example calculating the sum of each Area, or the summary of each Area.
I'm thinking of using a nested for loop to achieve this, but I'm not sure how to.
You can do this using aggregate, but the dplyr package makes it very easy to work with such problems. There are plenty of duplicates of this question, though.
library(dplyr)
df <- structure(c("Area 1", "Area 3", "Area 1", "Area 2", "Area 1",
"Area 3", "Area 4", "99", "85", "60", "90", "40", NA, "10"), .Dim = c(7L,
2L), .Dimnames = list(NULL, c("Area", "Num")))
df <- data.frame(df)
df$Num <- as.numeric(df$Num)
df2 <- df %>%
  group_by(Area) %>%
  summarise(totalNum = sum(Num, na.rm = TRUE))
df2
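For completeness, the aggregate() route mentioned above would look something like this (using the cleaned df with numeric Num; the formula interface drops the NA row by default):
aggregate(Num ~ Area, data = df, FUN = sum)
#     Area Num
# 1 Area 1 199
# 2 Area 2  90
# 3 Area 3  85
# 4 Area 4  10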
In order to apply the function to every level of the factor, we can resort to the by function:
dt <- structure(c("Area 1", "Area 3", "Area 1", "Area 2", "Area 1",
"Area 3", "Area 4", "99", "85", "60", "90", "40", NA, "10"), .Dim = c(7L, 2L), .Dimnames = list(NULL, c("Area", "Num")))
dt <- data.frame(dt)
dt$Num <- as.numeric(dt$Num)
t <- by(dt$Num, dt$Area, sum)
t
Doing the same thing using data.table
library(data.table)
dt <- data.table(df)
dt[,sum(as.numeric(Num),na.rm=T),by=Area]
## Area V1
## 1: Area 1 199
## 2: Area 3 85
## 3: Area 2 90
## 4: Area 4 10
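And if a full summary() per Area is wanted rather than just the sum, something like tapply (or by) gives that directly; this is a sketch using the cleaned df from above:
# per-group summary; returns a list with one element per Area
tapply(df$Num, df$Area, summary)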
