Generate column id - r

I am working with log data and trying to find the round number of each event. The start of a round is signaled by action=="start". I want to create an "action.round" column that tells me which round each event corresponds to.
I have data such as this:
# Sample log data: one row per event; action == "start" marks the start of a round.
data <- read_table2("Id action
A start
A na
A start
A na
A na
A na
A na
A start
B start
B na
B start
B na
B start
B na")
I am trying to create an output such as this:
output <- read_table2("Id action action.round
A start 1
A na 1
A start 2
A na 2
A na 2
A na 2
A na 2
A start 3
B start 1
B na 1
B start 2
B na 2
B start 3
B na 3")
So far, I have been able to get part of the output by using row_number(), like this:
` data %>%
mutate(round.start=case_when(actionValue=="start"~"start",TRUE~"NA")) %>%
ungroup() %>%
group_by(Id,round.start) %>%
mutate(action.round=row_number())`
But now, I would like to fill the round number that corresponds to round.start=="start" into the column, so that I know which round number each row actually corresponds to (see desired output above).

You could use cumsum after grouping by Id.
library(dplyr)
data %>% group_by(Id) %>% mutate(action.round = cumsum(action == "start"))
# Id action action.round
# <chr> <chr> <int>
# 1 A start 1
# 2 A na 1
# 3 A start 2
# 4 A na 2
# 5 A na 2
# 6 A na 2
# 7 A na 2
# 8 A start 3
# 9 B start 1
#10 B na 1
#11 B start 2
#12 B na 2
#13 B start 3
#14 B na 3
This can be done in base R
# Running count of "start" rows within each Id gives the round number.
data$action.round <- ave(data$action == "start", data$Id, FUN = cumsum)
and data.table as well
library(data.table)
setDT(data)[, action.round := cumsum(action == "start"), Id]
data
data <- structure(list(Id = c("A", "A", "A", "A", "A", "A", "A", "A",
"B", "B", "B", "B", "B", "B"), action = c("start", "na", "start",
"na", "na", "na", "na", "start", "start", "na", "start", "na",
"start", "na")), row.names = c(NA, -14L), spec = structure(list(
cols = list(Id = structure(list(), class = c("collector_character",
"collector")), action = structure(list(), class = c("collector_character",
"collector")), action.round = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))

Related

Compare and identify the missing rows

I would like to compare two data frames (df1 and df2) row by row, based on the serial and day variables, and create a new column called compare that flags the rows of df2 that are missing from df1. How can this be done in R? I tried the inner_join function without success.
Sample structure df1 and df2
Desired output:
Sample data
df1<-structure(list(serial = c(1, 2, 3, 4, 5), day = c(1, 0, 1, 0,
0)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA,
-5L), spec = structure(list(cols = list(serial = structure(list(), class = c("collector_double",
"collector")), day = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
df2<-structure(list(serial = c(1, 2, 3, 4, 5, 5, 7), day = c(1, 0,
1, 0, 0, 1, 1)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -7L), spec = structure(list(cols = list(
serial = structure(list(), class = c("collector_double",
"collector")), day = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
We can use tidyverse
library(dplyr)
df2 %>%
mutate(compare = TRUE) %>%
left_join(df1 %>%
mutate(compare1 = TRUE), by = c('serial', 'day')) %>%
transmute(serial, day, compare = (!is.na(compare1)))
-output
# A tibble: 7 x 3
serial day compare
<dbl> <dbl> <lgl>
1 1 1 TRUE
2 2 0 TRUE
3 3 1 TRUE
4 4 0 TRUE
5 5 0 TRUE
6 5 1 FALSE
7 7 1 FALSE
Or with data.table, which is faster and more efficient
library(data.table)
setDT(df2)[, compare := FALSE][setDT(df1), compare := TRUE, on = .(serial, day)]
One way would be to create a unique key combining the two columns and use %in% to find if the key is present in another dataset.
A base R option -
df2$compare <- do.call(paste, df2) %in% do.call(paste, df1)
df2
# A tibble: 7 x 3
# serial day compare
# <dbl> <dbl> <lgl>
#1 1 1 TRUE
#2 2 0 TRUE
#3 3 1 TRUE
#4 4 0 TRUE
#5 5 0 TRUE
#6 5 1 FALSE
#7 7 1 FALSE
If there are more columns in your data apart from serial and day use the below code.
cols <- c('serial', 'day')
df2$compare <- do.call(paste, df2[cols]) %in% do.call(paste, df1[cols])
A base R option
transform(
merge(cbind(df1, compare = TRUE), df2, all = TRUE),
compare = !is.na(compare)
)
gives
serial day compare
1 1 1 TRUE
2 2 0 TRUE
3 3 1 TRUE
4 4 0 TRUE
5 5 0 TRUE
6 5 1 FALSE
7 7 1 FALSE

Convert all columns ending in 'ID' to character in tidyverse

I have numerous dataframes with many columns where the name of the column ends in "ID". What's the simplest way to change the type of every column ending in "ID" to character? Ideally I'd pass the imported dataframe to a function which would return the same dataframe but with the column types changed. I definitely can't hardcode the column names as I will not know in advance what the columns are.
Here's some sample data:
test_data <- structure(list(ContactID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
), SystemID = c(3, 1, 5, 4, 3, 5, 35, 1, 55, 52, 9), Value1 = c("A",
"B", "C", "D", "E", "F", "E", "G", "D", "S", "C"), Value2 = c("1/01/2020",
"2/01/2020", "3/01/2020", "4/01/2020", "5/01/2020", "6/01/2020",
"7/01/2020", "8/01/2020", "9/01/2020", "10/01/2020", "11/01/2020"
), OtherID = c(10004, 10009, 10002, 10007, 10099, 10010, 10002,
10004, 10002, 10007, 10099)), class = c("spec_tbl_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -11L), spec = structure(list(
cols = list(ContactID = structure(list(), class = c("collector_double",
"collector")), SystemID = structure(list(), class = c("collector_double",
"collector")), Value1 = structure(list(), class = c("collector_character",
"collector")), Value2 = structure(list(), class = c("collector_character",
"collector")), OtherID = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
Columns ContactID, SystemID and OtherID have been imported from a CSV file (using read_csv from readr) and so have been designated numeric. I want a function where I can pass this (or any other dataframe) to change any columns ending in ID to character.
I've tried this but it seems very clumsy. Looking for a neater solution.
# Attempts to turn every column whose name ends in "ID" into character by
# overwriting each cell with as.character() of itself, one cell at a time.
# x: a data frame (it is accessed via colnames(), nrow() and x[j, i]).
# Returns x after the cell-by-cell assignments.
change_ID_cols <- function(x) {
# Integer positions of the columns whose names match the regex "ID$".
id_cols <- grep("ID$", colnames(x))
for (i in id_cols) {
# NOTE(review): 1:nrow(x) yields c(1, 0) when x has zero rows;
# seq_len(nrow(x)) would be the safe form.
for (j in 1:nrow(x)) {
# Replace the single cell in row j, column i with its character form.
x[j,i] <- as.character(x[j,i])
}
}
x
}
Does this work:
library(dplyr)
# ends_with() matches case-insensitively by default, so columns such as "Paid"
# or "Valid" would also be converted. ignore.case = FALSE restricts the match
# to names ending in upper-case "ID", like the regex "ID$".
test_data %>%
  mutate(across(ends_with("ID", ignore.case = FALSE), as.character))
# A tibble: 11 x 5
ContactID SystemID Value1 Value2 OtherID
<chr> <chr> <chr> <chr> <chr>
1 1 3 A 1/01/2020 10004
2 2 1 B 2/01/2020 10009
3 3 5 C 3/01/2020 10002
4 4 4 D 4/01/2020 10007
5 5 3 E 5/01/2020 10099
6 6 5 F 6/01/2020 10010
7 7 35 E 7/01/2020 10002
8 8 1 G 8/01/2020 10004
9 9 55 D 9/01/2020 10002
10 10 52 S 10/01/2020 10007
11 11 9 C 11/01/2020 10099
>
You don't have to change each value individually to character. You can turn the complete column into character at once. To do this for multiple columns use lapply.
# Convert every column whose name ends in "ID" to character.
# x: a data frame. Returns x with the matching columns converted; all other
# columns are left untouched.
change_ID_cols <- function(x) {
  # Positions of the columns whose names end in "ID".
  id_positions <- which(grepl("ID$", colnames(x)))
  # Each matching column is converted as a whole vector, not cell by cell.
  for (pos in id_positions) {
    x[[pos]] <- as.character(x[[pos]])
  }
  x
}
An option with data.table would be
library(data.table)
nm <- grep('ID$', names(test_data), value = TRUE)
setDT(test_data)[, (nm) := lapply(.SD, as.character), .SDcols = nm]

R - apply function on two files in folders with for loop or lapply and save results in one dataframe

I have a data set in "data" with 20 folders, which are identical in their structure. The only difference at the level of the folders are their names (from "1" to "20"). Please see the pattern below. The files have always the same file name and the same column structure. There might be a difference in the column length in the .csv files between folders, but not between the .csv files in the same folder. There are no missing values in the data frames. I want to work with the columns "mean" from the files.
Data structure
data
- 1 (folder)
- alpha (file)
- mean (column)
- .... (more columns)
- beta (file)
- mean (column)
- .... (more columns)
- ... (more files)
- 2 (folder)
- alpha (file)
- mean (column)
- .... (more columns)
- beta (file)
- mean (column)
- .... (more columns)
- ... (more files)
- ... (more folders with the same structure)
I would like to compare the mean from alpha to the mean from beta in one folder. In the end, however, I would like to have one dataframe that combines the results from all individual folders, so that I can create faceted boxplots and descriptive statistics from this dataframe.
I am still new to R and apparently lack the skills for it (also sorry for the complicated code and my English). I can manually perform the task for one folder each, but I can not put the findings together with a for loop or lapply solution.
I have found many threads where data frames need to be merged without prior executing of a function from two files in the same folder. I do hope I produced a workable minimal example with 2 data frames each from 2 folders.
library(plyr)
library(tidyverse)
alpha1 <- read_csv('data/1/alpha.csv')
beta1 <- read_csv('data/1/beta.csv')
alpha2 <- read_csv('data/2/alpha2.csv')
beta2 <- read_csv('data/2/beta2.csv')
Folder 1
alpha1 <- structure(list(Name = c("A", "B", "C", "D", "E", "F", "G", "H",
"I", "J", "K"), mean = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)), class = c("spec_tbl_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -11L), spec = structure(list(
cols = list(Name = structure(list(), class = c("collector_character",
"collector")), mean = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
beta1 <- structure(list(Name = c("A", "B", "C", "D", "E", "F", "G", "H",
"I", "J", "K"), mean = c(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)), class = c("spec_tbl_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -11L), spec = structure(list(
cols = list(Name = structure(list(), class = c("collector_character",
"collector")), mean = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
alpha_mean <- alpha1 %>% select(mean_alpha = mean)
alphabeta <- alpha_mean %>% add_column(mean_beta = beta1$mean)
alphabeta_table <- ddply(alphabeta, .(), transform, alphabeta = (mean_alpha/mean_beta))
alphabeta_table
.id mean_alpha mean_beta alphabeta
1 <NA> 1 2 0.5000000
2 <NA> 2 3 0.6666667
3 <NA> 3 4 0.7500000
4 <NA> 4 5 0.8000000
5 <NA> 5 6 0.8333333
6 <NA> 6 7 0.8571429
7 <NA> 7 8 0.8750000
8 <NA> 8 9 0.8888889
9 <NA> 9 10 0.9000000
10 <NA> 10 11 0.9090909
11 <NA> 11 12 0.9166667
Folder 2
alpha2 <- structure(list(Name = c("A", "B", "C", "D", "E", "F", "G", "H",
"I", "J", "K", "L", "M"), mean = c(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -13L), spec = structure(list(
cols = list(Name = structure(list(), class = c("collector_character",
"collector")), mean = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
beta2 <- structure(list(Name = c("A", "B", "C", "D", "E", "F", "G", "H",
"I", "J", "K", "L", "M"), mean = c(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -13L), spec = structure(list(
cols = list(Name = structure(list(), class = c("collector_character",
"collector")), mean = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
alpha2_mean <- alpha2 %>% select(mean_alpha = mean)
alphabeta2 <- alpha2_mean %>% add_column(mean_beta = beta2$mean)
alphabeta2_table <- ddply(alphabeta2, .(), transform, alphabeta = (mean_alpha/ mean_beta))
alphabeta2_table
.id mean_alpha mean_beta alphabeta
1 <NA> 2 3 0.6666667
2 <NA> 3 4 0.7500000
3 <NA> 4 5 0.8000000
4 <NA> 5 6 0.8333333
5 <NA> 6 7 0.8571429
6 <NA> 7 8 0.8750000
7 <NA> 8 9 0.8888889
8 <NA> 9 10 0.9000000
9 <NA> 10 11 0.9090909
10 <NA> 11 12 0.9166667
11 <NA> 12 13 0.9230769
12 <NA> 13 14 0.9285714
13 <NA> 14 15 0.9333333
Desired output
My desired output would be:
.id mean_alpha mean_beta alphabeta
1 1 1 2 0.5000000
2 1 2 3 0.6666667
3 1 3 4 0.7500000
4 1 4 5 0.8000000
5 1 5 6 0.8333333
6 1 6 7 0.8571429
7 1 7 8 0.8750000
8 1 8 9 0.8888889
9 1 9 10 0.9000000
10 1 10 11 0.9090909
11 1 11 12 0.9166667
1 2 2 3 0.6666667
2 2 3 4 0.7500000
3 2 4 5 0.8000000
4 2 5 6 0.8333333
5 2 6 7 0.8571429
6 2 7 8 0.8750000
7 2 8 9 0.8888889
8 2 9 10 0.9000000
9 2 10 11 0.9090909
10 2 11 12 0.9166667
11 2 12 13 0.9230769
12 2 13 14 0.9285714
13 2 14 15 0.9333333
1 3 ... ... ...
2 3 ... ... ...
...
Thank you for any help!
Try this solution :
Get all the folders using list.dirs.
For each folder read the "alpha" and "beta" files and return a 3 column tibble back with alpha, beta and alphabeta values.
Bind all the dataframes together with an id column to know which folder each value comes from.
# One sub-folder per run ("1", "2", ...). The question stores them under
# "data" (lower case), which matters on case-sensitive file systems.
all_folders <- list.dirs("data", recursive = FALSE, full.names = TRUE)
# Name each path by its folder so that `.id` below records the folder name.
# Without names, `.id` is just the list position, and list.dirs() sorts
# lexicographically ("1", "10", "11", ..., "2"), so position 2 would be
# folder "10" rather than folder "2".
names(all_folders) <- basename(all_folders)
# For every folder: read the alpha and beta files and return their `mean`
# columns side by side with the alpha/beta ratio; rows are stacked across
# folders with the folder name in column `id`.
result <- purrr::map_df(all_folders, function(x) {
  # Sorted alphabetically, so the alpha file comes before the beta file.
  all_Files <- list.files(x, full.names = TRUE, pattern = 'alpha|beta')
  df1 <- read.csv(all_Files[1])
  df2 <- read.csv(all_Files[2])
  tibble::tibble(alpha = df1$mean, beta = df2$mean, alphabeta = alpha/beta)
}, .id = "id")

Merging data frame and filling missing values [duplicate]

This question already has answers here:
Merging a lot of data.frames [duplicate]
(1 answer)
How do I replace NA values with zeros in an R dataframe?
(29 answers)
Closed 2 years ago.
I want to merge the following 3 data frames and fill the missing values with -1. I think I should use the merge() function, but I don't know exactly how to do it.
> df1
Letter Values1
1 A 1
2 B 2
3 C 3
> df2
Letter Values2
1 A 0
2 C 5
3 D 9
> df3
Letter Values3
1 A -1
2 D 5
3 B -1
The desired output would be:
Letter Values1 Values2 Values3
1 A 1 0 -1
2 B 2 -1 -1 # fill missing values with -1
3 C 3 5 -1
4 D -1 9 5
code:
> dput(df1)
structure(list(Letter = structure(1:3, .Label = c("A", "B", "C"
), class = "factor"), Values1 = c(1, 2, 3)), class = "data.frame", row.names = c(NA,
-3L))
> dput(df2)
structure(list(Letter = structure(1:3, .Label = c("A", "C", "D"
), class = "factor"), Values2 = c(0, 5, 9)), class = "data.frame", row.names = c(NA,
-3L))
> dput(df3)
structure(list(Letter = structure(c(1L, 3L, 2L), .Label = c("A",
"B", "D"), class = "factor"), Values3 = c(-1, 5, -1)), class = "data.frame", row.names = c(NA,
-3L))
You can get data frames in a list and use merge with Reduce. Missing values in the new dataframe can be replaced with -1.
new_df <- Reduce(function(x, y) merge(x, y, all = TRUE), list(df1, df2, df3))
new_df[is.na(new_df)] <- -1
new_df
# Letter Values1 Values2 Values3
#1 A 1 0 -1
#2 B 2 -1 -1
#3 C 3 5 -1
#4 D -1 9 5
A tidyverse way with the same logic :
library(dplyr)
library(purrr)
library(tidyr) # replace_na() is exported by tidyr, not by dplyr or purrr
# Full-join all data frames on the shared key, then fill the gaps with -1.
list(df1, df2, df3) %>%
  reduce(full_join, by = "Letter") %>%
  # Only the numeric value columns are filled: -1 cannot be cast to the type
  # of the non-numeric `Letter` key column.
  mutate(across(where(is.numeric), ~ replace_na(.x, -1)))
Here's a dplyr solution
# Chain two full joins on the shared key, then replace the NAs introduced by
# the joins with -1 in the numeric columns only.
df1 %>%
  full_join(df2, by = "Letter") %>%
  full_join(df3, by = "Letter") %>%
  # across() supersedes mutate_if(); replace_na() is qualified because tidyr
  # is not attached in this snippet.
  mutate(across(where(is.numeric), ~ tidyr::replace_na(.x, -1)))
output:
Letter Values1 Values2 Values3
<chr> <dbl> <dbl> <dbl>
1 A 1 0 -1
2 B 2 -1 -1
3 C 3 5 -1
4 D -1 9 5

Aggregate by group AND add column to data frame in R [duplicate]

This question already has answers here:
Calculate group mean, sum, or other summary stats. and assign column to original data
(4 answers)
Closed 4 years ago.
For a sample dataframe:
df1 <- structure(list(place = c("a", "a", "b", "b", "b", "b", "c", "c",
"c", "d", "d"), animal = c("cat", "bear", "cat", "bear", "pig",
"goat", "cat", "bear", "goat", "goat", "bear"), number = c(5,
6, 7, 4, 5, 6, 8, 5, 3, 7, 4)), .Names = c("place", "animal",
"number"), row.names = c(NA, -11L), spec = structure(list(cols = structure(list(
place = structure(list(), class = c("collector_character",
"collector")), animal = structure(list(), class = c("collector_character",
"collector")), number = structure(list(), class = c("collector_integer",
"collector"))), .Names = c("place", "animal", "number")),
default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"), class = c("tbl_df",
"tbl", "data.frame"))
I want to create a variable 'sum' which sums the 'number' column by 'place' (regardless of animal), and adds it to the dataframe.
The command below:
df1$sum <- aggregate(df1$number, by=list(Category=df1$place), FUN=sum)
... tries to do the sum but can't complete the function because it wants to report by only the number of individual places (hence why we get this error):
Error in `$<-.data.frame`(`*tmp*`, sum, value = list(Category = c("a", :
replacement has 4 rows, data has 11
Any ideas how I add this extra column onto my dataframe?
Since you have a tibble, first a dplyr solution. Next a base R version.
using dplyr:
df1 %>%
group_by(place) %>%
mutate(sum_num = sum(number))
# A tibble: 11 x 4
# Groups: place [4]
place animal number sum_num
<chr> <chr> <dbl> <dbl>
1 a cat 5 11
2 a bear 6 11
3 b cat 7 22
4 b bear 4 22
5 b pig 5 22
6 b goat 6 22
7 c cat 8 16
8 c bear 5 16
9 c goat 3 16
10 d goat 7 11
11 d bear 4 11
using base R:
df1$sum_num <- ave(df1$number, df1$place, FUN = sum)
# A tibble: 11 x 4
place animal number sum_num
<chr> <chr> <dbl> <dbl>
1 a cat 5 11
2 a bear 6 11
3 b cat 7 22
4 b bear 4 22
5 b pig 5 22
6 b goat 6 22
7 c cat 8 16
8 c bear 5 16
9 c goat 3 16
10 d goat 7 11
11 d bear 4 11

Resources