I have two tables with information that I would like to join with a testcase as key. I could first join them, then rename the columns and then re-order the dataframe, but is there a more elegant way?
df1 <- data.frame(
  testcase = c('testcase1', 'testcase2', 'testcase3', 'testcase4', 'testcase5'),
  passed = c('2', '0', '2', '0', '0'),
  failed = c('0', '2', '2', '0', '2'))
df2 <- data.frame(
  id = 1:10,
  testid = c('testcase3', 'testcase1', 'testcase3', 'testcase2', 'testcase5',
             'testcase1', 'testcase3', 'testcase5', 'testcase2', 'testcase3'),
  total_passed = rep("", 10),
  total_failed = rep("", 10),
  total_items = rep("", 10))
My solution would be the following, but could it be done in fewer steps?
df3 <- merge(df2, df1, by.x = 'testid', by.y = 'testcase')
df3$total_passed <- df3$total_failed <- NULL
df3$total_items <- 10
df3 <- dplyr::select(df3, id, testid, total_passed = passed, total_failed = failed, total_items)
You can use the dplyr package:
library(dplyr)
df2 %>%
  inner_join(df1, by = c('testid' = 'testcase')) %>%
  transmute(id, testid, total_passed = passed, total_failed = failed,
            total_items = 10)
# id testid total_passed total_failed total_items
#1 1 testcase3 2 2 10
#2 2 testcase1 2 0 10
#3 3 testcase3 2 2 10
#4 4 testcase2 0 2 10
#5 5 testcase5 0 2 10
#6 6 testcase1 2 0 10
#7 7 testcase3 2 2 10
#8 8 testcase5 0 2 10
#9 9 testcase2 0 2 10
#10 10 testcase3 2 2 10
We can use an update join in data.table, adding the new columns to df2 by reference:
library(data.table)
setDT(df2)[df1, c('total_passed', 'total_failed', 'total_items')
:= .(passed, failed, 10), on = .(testid = testcase)]
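For completeness, a base R sketch of the same idea (assuming the sample data above): merge(), then rename and reorder in one short pass.
# Rename positionally: merge() puts the by column first, then x's columns, then y's
df3 <- setNames(
  merge(df2[c('id', 'testid')], df1, by.x = 'testid', by.y = 'testcase'),
  c('testid', 'id', 'total_passed', 'total_failed'))
df3$total_items <- 10
df3 <- df3[c('id', 'testid', 'total_passed', 'total_failed', 'total_items')]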
This question is probably best illustrated with an example.
Suppose I have a dataframe df with a binary variable b (values of b are 0 or 1). How can I take a random sample of size 10 from this dataframe so that I have 2 instances where b=0 and 8 instances where b=1 in the random sample?
Right now, I know that I can do df[sample(nrow(df), 10), ] to get part of the answer, but that would give me a random number of 0 and 1 instances. How can I specify an exact number of 0 and 1 instances while still taking a random sample?
Here's an example of how I'd do this... take two samples and combine them. I've written a simple function so you can "just take one sample."
With a vector:
pop <- sample(c(0,1), 100, replace = TRUE)
yoursample <- function(pop, n_zero, n_one){
  c(sample(pop[pop == 0], n_zero),
    sample(pop[pop == 1], n_one))
}
yoursample(pop, n_zero = 2, n_one = 8)
[1] 0 0 1 1 1 1 1 1 1 1
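One caveat worth noting (an addition, not part of the original answer): when x is a single positive number, sample(x, n) samples from 1:x rather than from x itself, so a pop containing exactly one 0 or one 1 would misbehave. Sampling indices instead avoids this:
# Safely sample n elements of x, even when length(x) == 1
safe_sample <- function(x, n) x[sample.int(length(x), n)]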
Or, if you are working with a dataframe with some unique index called id:
# Where d1 is your data you are summarizing with mean and sd
dat <- data.frame(
id = 1:100,
val = sample(c(0,1), 100, replace = TRUE),
d1 = runif(100))
yoursample <- function(dat, n_zero, n_one){
  c(sample(dat[dat$val == 0, "id"], n_zero),
    sample(dat[dat$val == 1, "id"], n_one))
}
sample_ids <- yoursample(dat, n_zero = 2, n_one = 8)
sample_ids
mean(dat[dat$id %in% sample_ids,"d1"])
sd(dat[dat$id %in% sample_ids,"d1"])
Here is a suggestion:
First create a sample of 0s and 1s with an id column.
Then sample 2 rows where the value is 0 and 8 rows where it is 1, and bind them together:
library(tidyverse)
set.seed(123)
df <- as_tibble(sample(0:1, size = 50, replace = TRUE)) %>%
  mutate(id = row_number())
df1 <- df[sample(which(df$value == 0), 2), ]
df2 <- df[sample(which(df$value == 1), 8), ]
df_final <- bind_rows(df1, df2)
# A tibble: 10 x 2
value id
<int> <int>
1 0 14
2 0 36
3 1 21
4 1 24
5 1 2
6 1 50
7 1 49
8 1 41
9 1 28
10 1 33
library(tidyverse)
set.seed(123)
df <- data.frame(a = letters,
                 b = sample(c(0, 1), 26, replace = TRUE))
bind_rows(
  df %>%
    filter(b == 0) %>%
    sample_n(2),
  df %>%
    filter(b == 1) %>%
    sample_n(8)
) %>%
  arrange(a)
a b
1 d 1
2 g 1
3 h 1
4 l 1
5 m 1
6 o 1
7 p 0
8 q 1
9 s 0
10 v 1
I have a df where one variable is an integer. I'd like to split this column into its individual digits. See my example below:
Group Number
A 456
B 3
C 18
To
Group Number Digit1 Digit2 Digit3
A 456 4 5 6
B 3 3 NA NA
C 18 1 8 NA
We can use read.fwf from base R. Find the maximum number of characters (nchar) in the 'Number' column (mx). Read the 'Number' column as fixed-width after converting it to character (as.character), specifying the widths as 1 repeated mx times, and assign the output to new 'Digit' columns in the data:
mx <- max(nchar(df1$Number))
df1[paste0("Digit", seq_len(mx))] <- read.fwf(textConnection(
as.character(df1$Number)), widths = rep(1, mx))
-output
df1
# Group Number Digit1 Digit2 Digit3
#1 A 456 4 5 6
#2 B 3 3 NA NA
#3 C 18 1 8 NA
data
df1 <- structure(list(Group = c("A", "B", "C"), Number = c(456L, 3L,
18L)), class = "data.frame", row.names = c(NA, -3L))
Another base R option (I think #akrun's approach using read.fwf is much simpler)
cbind(
df,
with(
df,
type.convert(
`colnames<-`(do.call(
rbind,
lapply(
strsplit(as.character(Number), ""),
`length<-`, max(nchar(Number))
)
), paste0("Digit", seq(max(nchar(Number))))),
as.is = TRUE
)
)
)
which gives
Group Number Digit1 Digit2 Digit3
1 A 456 4 5 6
2 B 3 3 NA NA
3 C 18 1 8 NA
Using splitstackshape::cSplit
splitstackshape::cSplit(df, 'Number', sep = '', stripWhite = FALSE, drop = FALSE)
# Group Number Number_1 Number_2 Number_3
#1: A 456 4 5 6
#2: B 3 3 NA NA
#3: C 18 1 8 NA
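If the Digit1/Digit2/... names from the question are wanted, a quick rename afterwards (a sketch; cSplit returns a data.table, so data.table::setnames() applies):
out <- splitstackshape::cSplit(df, 'Number', sep = '', stripWhite = FALSE, drop = FALSE)
# Number_1, Number_2, ... become Digit1, Digit2, ...; other names are unchanged
data.table::setnames(out, sub('^Number_', 'Digit', names(out)))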
Updated
I realized I could use the max function to get the character-count limit across rows, so I could include it in my map2 call and save a few lines of code, thanks to an accident that led to an inspiration by dear @ThomasIsCoding.
library(dplyr)
library(tidyr)
library(purrr)
library(stringr)
df %>%
  rowwise() %>%
  mutate(map2_dfc(Number, 1:max(nchar(Number)), ~ str_sub(.x, .y, .y))) %>%
  unnest(cols = !c(Group, Number)) %>%
  rename_with(~ str_replace(., "\\.\\.\\.", "Digit"), .cols = !c(Group, Number)) %>%
  mutate(across(!c(Group, Number), as.numeric))
# A tibble: 3 x 5
Group Number Digit1 Digit2 Digit3
<chr> <dbl> <dbl> <dbl> <dbl>
1 A 456 4 5 6
2 B 3 3 NA NA
3 C 18 1 8 NA
Data
df <- tribble(
~Group, ~Number,
"A", 456,
"B", 3,
"C", 18
)
Two base R methods:
no_cols <- max(nchar(as.character(df1$Number)))
# Using `strsplit()`:
cbind(df1, setNames(data.frame(do.call(rbind,
lapply(strsplit(as.character(df1$Number), ""),
function(x) {
length(x) <- no_cols
x
}
)
)
), paste0("Digit", seq_len(no_cols))))
# Using `regmatches()` and `gregexpr()`:
cbind(df1, setNames(data.frame(do.call(rbind,
lapply(regmatches(df1$Number, gregexpr("\\d", df1$Number)),
function(x) {
length(x) <- no_cols
x
}
)
)
), paste0("Digit", seq_len(no_cols))))
I often have two data frames that I wish to join, where I expect all values to join. If not all values are present in both data frames, I want it to return an error.
Here is a MWE:
library(dplyr, warn.conflicts = FALSE)
df1 <- data.frame(
id = c(1:5),
value1 = rep(1, 5)
)
print(df1)
#> id value1
#> 1 1 1
#> 2 2 1
#> 3 3 1
#> 4 4 1
#> 5 5 1
df2 <- data.frame(
id = c(1:4),
value2 = rep(2, 4)
)
print(df2)
#> id value2
#> 1 1 2
#> 2 2 2
#> 3 3 2
#> 4 4 2
df3 <- inner_join(
df1,
df2,
by = "id")
print(df3)
#> id value1 value2
#> 1 1 1 2
#> 2 2 1 2
#> 3 3 1 2
#> 4 4 1 2
# Check if all values have joined
stopifnot(
nrow(df3) == max(nrow(df1), nrow(df2))
)
#> Error: nrow(df3) == max(nrow(df1), nrow(df2)) is not TRUE
Created on 2021-03-31 by the reprex package (v1.0.0)
This works, but I do not like the stopifnot(). It feels cumbersome, particularly if I wish to overwrite df2: then I need to store a temporary value df2_previous_row_num <- nrow(df2) and afterwards run stopifnot(nrow(df2) == df2_previous_row_num).
Also, the nrow() test only works if all values in id are unique. There are other methods, e.g. stopifnot(c(df1$id %in% df3$id, df2$id %in% df3$id)), but again these are ugly.
Really what I am looking for is a parameter that makes the join fail if some values do not join. Something like, inner_join(df1, df2, fail_if_not_all_present = TRUE).
I am not attached to the tidyverse - if there is a base R or data.table way of doing this then I would consider those.
Does anyone know anything?
You can try writing a custom inner join function.
custom_inner_join <- function(data1, data2, by, fail_if_not_all_present = FALSE) {
  if (fail_if_not_all_present) {
    # Paste the key columns together so multi-column keys compare as one value
    vals1 <- do.call(paste, data1[by])
    vals2 <- do.call(paste, data2[by])
    if (all(vals1 %in% vals2) && all(vals2 %in% vals1)) {
      merge(data1, data2, by)
    } else stop('Not all key values are present')
  } else {
    merge(data1, data2, by)
  }
}
custom_inner_join(df1, df2, 'id')
# id value1 value2
#1 1 1 2
#2 2 1 2
#3 3 1 2
#4 4 1 2
custom_inner_join(df1, df2, 'id', fail_if_not_all_present = TRUE)
Error in custom_inner_join(df1, df2, "id", fail_if_not_all_present = TRUE) :
Not all key values are present
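A side note, not part of the original answer: newer dplyr versions (1.1.0 and later) added an unmatched argument to the mutating joins that does exactly what the question asks for, with no custom wrapper:
library(dplyr)
# Errors, because id = 5 in df1 has no match in df2
df3 <- inner_join(df1, df2, by = "id", unmatched = "error")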
I have a large database (90,000 × 1,500) organized by child observations, which also includes each child's mom's info. I want to reorganize the database according to the mom's data.
The problem is that each kid appears only once in the database, while a mom may appear up to 10 times (once per child).
In addition, I want the number of rows to equal the number of different mothers (approx. 40,000), with the data for each of her children (between 0 and 10) spread across columns.
For example, the DB I have and the DB I want to create:
You could use reshape
library(data.table)
df = data.frame(
'c' = c('c1', 'c2', 'c3', 'c4', 'c5'),
'id_num' = seq(1,5),
'age' = c(12, 15, 5, 8, 19),
'mom'= c(1,3,1,2,3)
)
df
c id_num age mom
1 c1 1 12 1
2 c2 2 15 3
3 c3 3 5 1
4 c4 4 8 2
5 c5 5 19 3
df = setDT(df)[order(mom)]
df[, id_child := seq(.N), mom]
reshape(df, idvar = "mom", timevar = "id_child", direction = "wide")
mom c.1 id_num.1 age.1 c.2 id_num.2 age.2
1: 1 c1 1 12 c3 3 5
2: 2 c4 4 8 <NA> NA NA
3: 3 c2 2 15 c5 5 19
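As an alternative to reshape(), data.table's dcast() handles multiple value.var columns directly (a sketch, reusing the df with the id_child column created above):
# One row per mom; c, id_num and age spread into c_1, c_2, id_num_1, ... columns
dcast(df, mom ~ id_child, value.var = c("c", "id_num", "age"))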
Here is a solution similar to @Metariat's, but in base R, using ave():
df$seq <- with(df, ave(id_num, mom, FUN = seq_along))
dfout <- reshape(df, idvar = "mom", timevar = "seq", direction = "wide")
such that
> dfout
mom c.1 id_num.1 age.1 c.2 id_num.2 age.2
1 1 c1 1 12 c3 3 5
2 3 c2 2 15 c5 5 19
4 2 c4 4 8 <NA> NA NA
EDIT:
If you have a very big data frame, you can try a divide-and-conquer approach and see if it helps:
library(plyr)
dfs <- split(df, df$mom)
lst <- lapply(dfs, function(x) {
  x <- within(x, seqnum <- ave(id_num, mom, FUN = seq_along))
  reshape(x, idvar = "mom", timevar = "seqnum", direction = "wide")
})
dfout <- rbind.fill(lst)
You can do this using the dplyr package, with group_by():
group_by(data, mom)
Then each mom's children are grouped together. You can then sort the data within groups as follows:
arrange(data, id_num, .by_group = TRUE)
To filter children aged 10 or under:
filter(data, age <= 10)
These steps are combined into one pipe in the sketch below.
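A minimal sketch, assuming a data frame data with mom, id_num, and age columns as in the question:
library(dplyr)
data %>%
  group_by(mom) %>%
  arrange(id_num, .by_group = TRUE) %>%
  filter(age <= 10)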
I have a data set like this....
ID Brand
--- --------
1 Cokacola
2 Pepsi
3 merge with 1
4 merge with 2
5 merge with 1
6 Fanta
And I want to write an R function which merges the rows and introduces a new variable according to ID, just like the following...
ID Brand merge
---- -------- --------
1 Cokacola 1,3,5
2 Pepsi 2,4
6 Fanta 6
Your data:
dat <- data.frame(
id = 1:6,
brand = c('Cokacola', 'Pepsi', 'merge with 1', 'merge with 2', 'merge with 1', 'Fanta'))
Inelegant-but-functional code:
repeats <- grepl('^merge with', dat$brand)
groups <- ifelse(repeats, gsub('merge with ', '', dat$brand), dat$id)
merge <- sapply(unique(groups), function(x) paste(dat$id[groups==x], collapse=','))
dat <- dat[!repeats,]
dat$merge <- merge
dat
## id brand merge
## 1 1 Cokacola 1,3,5
## 2 2 Pepsi 2,4
## 6 6 Fanta 6
There are most certainly ways to make this more elegant, depending on the consistency and makeup of the data; one such variant is sketched below.
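For instance, a tidier take on the same idea (a sketch, starting again from the dat defined at the top of this answer): build the group vector once, then paste the ids per group in a single pass with tapply().
repeats <- grepl('^merge with', dat$brand)
groups <- ifelse(repeats, sub('^merge with ', '', dat$brand), dat$id)
out <- dat[!repeats, ]
# tapply() returns a named vector ("1" = "1,3,5", ...); index it by group name
out$merge <- unname(tapply(dat$id, groups, paste, collapse = ',')[groups[!repeats]])
out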
You could try
library(reshape2)
indx <- !grepl('merge', df$Brand)
df1 <- df[indx, ]
val <- as.numeric(sub('[^0-9]+', '', df[!indx, 'Brand']))
ml <- melt(tapply(which(!indx), val, FUN = toString))
df2 <- merge(df1, ml, by.x = 'ID', by.y = 'Var1', all = TRUE)
df2$merge <- with(df2, ifelse(!is.na(value),
                              paste(ID, value, sep = ', '), ID))
df2[-3]
# ID Brand merge
#1 1 Cokacola 1, 3, 5
#2 2 Pepsi 2, 4
#3 6 Fanta 6