Copy preceding row if condition is met - r

My data
# Reproducible toy data: 5 locations with 5 rows each, values drawn from 0..4.
set.seed(123)
df <- data.frame(
  loc = rep(1:5, each = 5),
  value = sample(0:4, 25, replace = TRUE)
)
a <- c("x", "y", "z", "k")
# sample(a, 1) is evaluated once, so every row that has data shares the same
# id; rows whose value is 0 are flagged as "no.data".
df$id <- ifelse(df$value == 0, "no.data", sample(a, 1))
head(df)
loc value id
1 1 1 z
2 1 3 z
3 1 2 z
4 1 4 z
5 1 4 z
6 2 0 no.data
For rows where I have no data, the id column is "no.data" and the value column is 0. For all such rows (id == no.data and value == 0), I want to copy the value and id from the preceding row.
loc value id
1 1 1 z
2 1 3 z
3 1 2 z
4 1 4 z
5 1 4 z
6 2 4 z
Something like:
df %>% group_by(loc) %>% mutate(value = ifelse(value == 0, copy the value from preceding row), id = ifelse(id== "no.data", copy the id from preceding row ))

We could replace the 0s by NA and then do a fill
library(tidyverse)
library(naniar)
df %>%
replace_with_na(replace = list(value = 0, id = "no.data")) %>%
fill(value, id)

Unless you have a very big dataset a simple loop should do
# Walk the rows top to bottom. A row flagged as missing (id == "no.data" and
# value == 0) takes id and value from the row directly above it; because the
# fill is sequential, a run of missing rows inherits from the last real row.
# seq_len(nrow(df))[-1L] is empty when there are fewer than two rows, whereas
# 2:nrow(df) would count down (2, 1) and index rows that do not exist.
for (r in seq_len(nrow(df))[-1L]) {
  is_missing <- df$id[r] == "no.data" && df$value[r] == 0
  if (is_missing) {
    df[r, c("id", "value")] <- df[r - 1L, c("id", "value")]
  }
}

Related

how to select rows from lists in r?

df1 = data.frame(
y1 = c(1, -2, 3),
y2 = c(4, 5, 6),out = c("*", "*", "")
)
Create another dataframe
df2 = data.frame(
y1 = c(7, -3, 9),
y2 = c(1, 4, 6),out = c("", "*", "")
)
Create list of data frame using list()
lis = list(df1, df2)
I want to compare row by row (first row from index [[1]] with first row from index [[2]], and so on).
if both rows have * or empty (similar) under out, then select the row with the highest absolute value in y1 (and put under ind the index)
otherwise, take the y1 value of the row with * under out (and put under ind the index)
example of desired output:
y1 y2 out ind
1 4 * 1
-3 4 * 2
9 6 2
An option is to bind the datasets with bind_rows and specify .id to create an identifier for the list sequence. Then, grouped by the rowid of the 'ind' column, we arrange the rows in each group so that * values in out come first, followed by the absolute values of 'y1' in descending order. We slice the row with the max absolute value in 'y1' if the number of distinct elements in 'out' is 1, or else return the 1st row (which will have * in out if there is one)
library(dplyr)
library(data.table)
bind_rows(lis, .id = 'ind') %>%
group_by(rn = rowid(ind)) %>%
arrange(out != "*", desc(abs(y1)), .by_group = TRUE) %>%
slice(if(n_distinct(out) == 1) which.max(abs(y1)) else 1) %>%
ungroup %>%
select(y1, y2, out, ind)
-output
# A tibble: 3 × 4
y1 y2 out ind
<dbl> <dbl> <chr> <chr>
1 1 4 "*" 1
2 -3 4 "*" 2
3 9 6 "" 2
Or without arrange, after grouping, create a position index in slice ('i1') where the values in out are *, then use that index to subset the y1 values to find the position where it is absolute max in else case and return the position of 'i1' by indexing
bind_rows(lis, .id = 'ind') %>%
group_by(rn = rowid(ind)) %>%
slice({i1 <- which(out == "*")
if(n_distinct(out) == 1) which.max(abs(y1)) else
i1[which.max(abs(y1[i1]))]}) %>%
ungroup %>%
select(y1, y2, out, ind)
-output
# A tibble: 3 × 4
y1 y2 out ind
<dbl> <dbl> <chr> <chr>
1 1 4 "*" 1
2 -3 4 "*" 2
3 9 6 "" 2
Or use a similar ordering approach with data.table
library(data.table)
# Stack the list into one table: 'ind' records the list position and 'rn'
# numbers the rows within each list element.
dt1 <- rbindlist(lis, idcol = "ind")[, rn := rowid(ind)]
# Within each rn, put starred rows first, then the largest |y1| first.
# -abs(y1) replaces dplyr's desc() so this answer needs only data.table.
dt2 <- dt1[order(rn, out != "*", -abs(y1))]
# Per rn, pick one row index: the max |y1| row when 'out' is uniform,
# otherwise the first row (the starred one after ordering).
picked <- dt2[
  , .I[if (uniqueN(out) == 1) which.max(abs(y1)) else 1L], by = .(rn)
]$V1
dt2[picked][, rn := NULL][]
ind y1 y2 out
1: 1 1 4 *
2: 2 -3 4 *
3: 2 9 6

Random Sample From a Dataframe With Specific Count

This question is probably best illustrated with an example.
Suppose I have a dataframe df with a binary variable b (values of b are 0 or 1). How can I take a random sample of size 10 from this dataframe so that the sample contains 2 instances where b=0 and 8 instances where b=1?
Right now, I know that I can do df[sample(nrow(df), 10), ] to get part of the answer, but that would give me a random number of 0 and 1 instances. How can I specify the number of 0 and 1 instances while still taking a random sample?
Here's an example of how I'd do this... take two samples and combine them. I've written a simple function so you can "just take one sample."
With a vector:
pop <- sample(c(0,1), 100, replace = TRUE)
# Stratified draw from a 0/1 vector: returns n_zero zeros followed by n_one
# ones, each group sampled without replacement from the matching elements of
# pop. sample() errors if a group has fewer elements than requested.
yoursample <- function(pop, n_zero, n_one) {
  zeros <- pop[pop == 0]
  ones <- pop[pop == 1]
  # Draw the zeros before the ones so the RNG is consumed in a fixed order.
  drawn_zeros <- sample(zeros, n_zero)
  drawn_ones <- sample(ones, n_one)
  c(drawn_zeros, drawn_ones)
}
yoursample(pop, n_zero = 2, n_one = 8)
[1] 0 0 1 1 1 1 1 1 1 1
Or, if you are working with a dataframe with some unique index called id:
# Where d1 is your data you are summarizing with mean and sd
dat <- data.frame(
id = 1:100,
val = sample(c(0,1), 100, replace = TRUE),
d1 = runif(100))
#' Draw a stratified sample of row ids from a data frame with a 0/1 column
#'
#' @param dat Data frame with an `id` column and a `val` column holding 0/1.
#' @param n_zero Number of ids to draw from rows where `val == 0`.
#' @param n_one Number of ids to draw from rows where `val == 1`.
#' @return Vector of `n_zero + n_one` ids, the `val == 0` ids first. Errors
#'   when a stratum has fewer rows than requested.
yoursample <- function(dat, n_zero, n_one) {
  # Sample positions, not values: sample(x, n) treats a length-one numeric x
  # as 1:x, so a stratum holding the single id 57 would yield ids from 1..57.
  draw_ids <- function(ids, n) ids[sample.int(length(ids), n)]
  c(
    draw_ids(dat[dat$val == 0, "id"], n_zero),
    draw_ids(dat[dat$val == 1, "id"], n_one)
  )
}
sample_ids <- yoursample(dat, n_zero = 2, n_one = 8)
sample_ids
mean(dat[dat$id %in% sample_ids,"d1"])
sd(dat[dat$id %in% sample_ids,"d1"])
Here is a suggestion:
First create a sample of 0 and 1 with id column.
Then draw two samples by condition (2 rows where value is 0 and 8 rows where value is 1) and bind them together:
library(tidyverse)
set.seed(123)
df <- as_tibble(sample(0:1,size=50,replace=TRUE)) %>%
mutate(id = row_number())
df1 <- df[ sample(which (df$value ==0) ,2), ]
df2 <- df[ sample(which (df$value ==1), 8), ]
df_final <- bind_rows(df1, df2)
value id
<int> <int>
1 0 14
2 0 36
3 1 21
4 1 24
5 1 2
6 1 50
7 1 49
8 1 41
9 1 28
10 1 33
library(tidyverse)
set.seed(123)
df <- data.frame(a = letters,
b = sample(c(0,1),26,T))
bind_rows(
df %>%
filter(b == 0) %>%
sample_n(2),
df %>%
filter(b == 1) %>%
sample_n(8)
) %>%
arrange(a)
a b
1 d 1
2 g 1
3 h 1
4 l 1
5 m 1
6 o 1
7 p 0
8 q 1
9 s 0
10 v 1

Update specific values in a dataframe based on array index position

Let's say I have a dataframe
> colA <- c(1, 14, 8)
> colB <- c(4, 8, 9)
> colC <- c(1, 2, 14)
> df <- data.frame(colA, colB, colC)
> df
colA colB colC
1 1 4 1
2 14 8 2
3 8 9 14
What I want to do is create a second data frame which has the same structure as df, but has 1 whenever a specific number is found, and 0 otherwise, e.g., if the number were 14, df2 would look like this
> df2
colA colB colC
1 0 0 0
2 1 0 0
3 0 0 1
I thought I could create a 3x3 data frame of 0s (df2), use which() to get the index for the number in df, and then use that index to change what shows up in df2
> number <- 14
> index <- which(df == number)
> index
[1] 2 9
or perhaps more helpfully
> index <- which(df == number, arr.ind = T)
> index
row col
[1,] 2 1
[2,] 3 3
However, I am unsure how to use this index to specify which values in the df of NAs should be TRUE and which FALSE (i.e. how to reverse the which)?
NB - I will actually be testing this for multiple numbers, so I figured I would do it inside a for loop. So I want the final DF to show ones for every location which has any of the numbers (i.e. gradually switching the 0's "on" to 1's).
> numbers <- c(14, 9, 1)
> for(i in numbers){
> index <- which(df == i, arr.ind = T)
> #then do whatever needs to be done to change the index locations in df2
> }
P.S., in general, I work in the tidyverse, so tidyverse specific solutions would be grand, but base r would also be brilliant.
Ohh, and yes, this is for day 4 of Advent of Code - it's a useful challenge to help this non-expert coder learn.
Thanks
Here's a full example how it could be done.
Data
df <- structure(list(colA = c(1, 14, 8), colB = c(4, 8, 9), colC = c(1,
2, 14)), class = "data.frame", row.names = c(NA, -3L))
base R
data.frame( sapply( df, function(x) as.numeric( x == 14 | x == 8 ) ))
colA colB colC
1 0 0 0
2 1 1 0
3 1 0 1
for any number in a loop
setNames( data.frame( matrix( rowSums( sapply( c(14,8,1), function(x)
df==x ) ), dim(df) ) ), colnames( df ) )
colA colB colC
1 1 0 1
2 1 1 0
3 1 0 1
dplyr
library(dplyr)
# Recode every column to 1 where the value is 14 or 8, else 0. The result has
# one row per input row, so this is a mutate: returning several rows from
# summarise()/summarise_all() is deprecated as of dplyr 1.1.0.
df %>% mutate(across(everything(), ~ as.numeric(.x == 14 | .x == 8)))
colA colB colC
1 0 0 0
2 1 1 0
3 1 0 1
# or
# Same recode via reframe(), the dplyr >= 1.1.0 replacement for summarise()
# calls that return more than one row per group.
df %>% reframe(across(everything(), ~ as.numeric(.x == 14 | .x == 8)))
colA colB colC
1 0 0 0
2 1 1 0
3 1 0 1

create dummy variable based on values of another variable?

I have a large dataset with multiple columns of the following structure
A B
1. 1. D1
2. 1. D2
3. 2 D2
4. 3. D1
5. 3. D2
I'm trying to create a new data frame based on unique observations in column A, with a dummy variable "Dummy" coded as 1=D1, 2=D2, 3=both, like so:
A. Dummy
1. 1. 3
2. 2. 2
3. 3. 3
Any idea how I can go about this?
You can use aggregate.
aggregate(B ~ A, df, function(x) if(all(x == "D1")) 1 else if(all(x == "D2")) 2 else 3)
# A B
# 1 1 3
# 2 2 2
# 3 3 3
Another possible solution:
df %>%
group_by(A) %>%
summarise(B = paste0(B, collapse = "_")) %>%
mutate(Dummy = case_when(
B == "D1" ~ 1,
B == "D2" ~ 2,
B == "D1_D2" | B == "D2_D1" ~ 3,
TRUE ~ NA_real_
)) %>%
select(-B)
Result
# A tibble: 3 x 2
A Dummy
<dbl> <dbl>
1 1 3
2 2 2
3 3 3
Here is an option with dplyr. After grouping by 'A', if the number of distinct elements is greater than 1, return 3, or else use a named vector to match the first element of 'B'
library(dplyr)
df1 %>%
group_by(A) %>%
summarise(Dummy = if(n_distinct(B) > 1) 3L else
setNames(1:2, c("D1", "D2"))[first(B)])
# A tibble: 3 x 2
# A Dummy
#* <dbl> <int>
#1 1 3
#2 2 2
#3 3 3
data
df1 <- structure(list(A = c(1, 1, 2, 3, 3), B = c("D1", "D2", "D2",
"D1", "D2")), class = "data.frame", row.names = c("1.", "2.",
"3.", "4.", "5."))

How do you exclude values when creating a string when setting up initial conditions?

I'm trying to combine columns in my data frame so that they give me a certain string. I have columns titled "C", "H", "O", "N", and "S" for the elements. Each column lists the number of atoms of that element in the molecule, but I want to exclude some elements depending on their value. For example, when there are no oxygens the value is 0, so I want to exclude it when combining the elements to make a string.
#This is a portion of my data frame titled data4a
C H O N S
3 4 0 0 1
7 5 4 1 0
#The code I have is
data4a$NewComp = paste("C",data4a$Total.C,"H", data4a$NewH, "O", data4a$O, "N", data4a$N, "S", data4a$S, sep = "")
#This code gives me this
C H O N S NewComp
3 4 0 0 1 C3H4O0N0S1
7 5 4 1 0 C7H5O4N1S0
#I expect to see something like this when I print my results
C H O N S NewComp
3 4 0 0 1 C3H4S1
7 5 4 1 0 C7H5O4N
#I want values of zero to be excluded from the string created
An option is apply with argument MARGIN = 1
dat$NewComp <- apply(dat, 1, function(x) {
tmp <- unlist(x)
paste0(names(x)[tmp != 0], tmp[tmp != 0], collapse = "")
})
Result
dat
# C H O N S NewComp
#1 3 4 0 0 1 C3H4S1
#2 7 5 4 1 0 C7H5O4N1
data
dat <- structure(list(C = c(3L, 7L), H = 4:5, O = c(0L, 4L), N = 0:1,
S = c(1L, 0L)), .Names = c("C", "H", "O", "N", "S"), class = "data.frame", row.names = c(NA,
-2L))
Here is a base R solution that solves the question problem and simplifies the creation of the molecule vectors at the same time.
# Build one "<element><count>" label per cell, blank out the cells whose count
# is zero, then glue each row together. Masking on the numeric counts replaces
# gsub(".0", "", m): there "." matches any character, so the pattern also
# strips digits from counts such as 10 or 20 (C10H20 would collapse to "CH").
counts <- as.matrix(data4a)
m <- matrix(paste0(colnames(counts)[col(counts)], counts),
  nrow = nrow(counts))
m[counts == 0] <- ""
m <- apply(m, 1, paste, collapse = "")
data4a$NewComp <- m
data4a
# C H O N S NewComp
#1 3 4 0 0 1 C3H4S1
#2 7 5 4 1 0 C7H5O4N1
Data.
data4a <- read.table(text = "
C H O N S
3 4 0 0 1
7 5 4 1 0
", header = TRUE)
Another approach could be to use which to create a new dataframe with the row number, column number, and value of each entry that is not 0. We then replace the column number with the column name and use aggregate by row number to paste the formula together.
df1 <- which(df != 0, arr.ind = TRUE)
df2 <- cbind.data.frame(df1, value = df[df != 0])
df2$col <- names(df)[df2$col]
df$NewComp <- aggregate(paste0(df2$col, df2$value), list(df2$row),
paste0, collapse = "")[, 2]
df
# C H O N S NewComp
#1 3 4 0 0 1 C3H4S1
#2 7 5 4 1 0 C7H5O4N1
As mentioned in the comments on the other answer, if you have data only in selected columns, use df[selected_columns] in the first statement with which.
One possibility involving tidyverse could be:
df %>%
rowid_to_column() %>%
gather(var, val, -rowid) %>%
filter(val != 0) %>%
group_by(rowid) %>%
summarise(NewComp = paste0(paste0(var, val), collapse = "")) %>%
left_join(df %>%
rowid_to_column(), by = c("rowid" = "rowid")) %>%
ungroup() %>%
select(-rowid)
NewComp C H O N S
<chr> <int> <int> <int> <int> <int>
1 C3H4S1 3 4 0 0 1
2 C7H5O4N1 7 5 4 1 0
Or:
df %>%
rowid_to_column() %>%
gather(var, val, -rowid) %>%
filter(val != 0) %>%
group_by(rowid) %>%
mutate(NewComp = paste0(paste0(var, val), collapse = "")) %>%
spread(var, val, fill = 0) %>%
ungroup() %>%
select(-rowid)
Sample data:
df <- read.table(text = "C H O N S
3 4 0 0 1
7 5 4 1 0",
header = TRUE,
stringsAsFactors = FALSE)

Resources