Repeating loop and adding columns in R - r

I am trying to build an R code that will take my loop and run it 20 times. Each time I would like to add a column to the existing data frame. Here I tried it by adding the code 3 times, but I feel like there must be an easier way to automate this. I am very grateful for any help.
My original data file (called "igel") contains two columns ("Year" and "Grid") and 1096 rows. With the loop I pick a random number from the column "Grid" and check whether it has been picked before. If so it adds 0 to a new column if not it adds 1.
Here the code:
a <- data.frame(matrix(ncol = 2, nrow = 0))
x <- c("number", "count")
colnames(a) <- x
for (i in 1:1096) {
num_i <- sample(igel$Grid, 1)
count_i <- c(if (num_i %in% a$number == TRUE) {0} else {1})
a<-a %>% add_row(number = num_i, count = count_i)
}
b <- data.frame(matrix(ncol = 2, nrow = 0))
x <- c("number", "count")
colnames(b) <- x
for (i in 1:1096) {
num_i <- sample(igel$Grid, 1)
count_i <- c(if (num_i %in% b$number == TRUE) {0} else {1})
b<-b %>% add_row(number = num_i, count = count_i)
}
c <- data.frame(matrix(ncol = 2, nrow = 0))
x <- c("number", "count")
colnames(c) <- x
for (i in 1:1096) {
num_i <- sample(igel$Grid, 1)
count_i <- c(if (num_i %in% c$number == TRUE) {0} else {1})
c<-c %>% add_row(number = num_i, count = count_i)
}
df.total<- cbind(a$count,b$count, c$count)

Consider sapply and even its wrapper, replicate and calculate number and count separately in vector calculations instead of growing object in loop by row.
# RUNS 3 SAMPLES OF igel$Grid 1,096 TIMES (ADJUST 3 TO ANY POSITIVE INT LIKE 20)
grid_number <- data.frame(replicate(3, replicate(1096, sample(igel$Grid, 1))))
# RUNS ACROSS 3 COLUMNS TO CHECK CURRENT ROW VALUE IS INCLUDED FOR ALL VALUES BEFORE ROW
grid_count <- sapply(grid_number, function(col)
sapply(seq_along(col), function(i)
ifelse(col[i] %in% col[1:(i-1)], 0, 1)
)
)
While above does not exactly reproduce your output, df.total (a matrix and not data frame), due to the random sampling within iterations, the two maintain similar structure:
dim(df.total)
# [1] 1096 3
dim(grid_count)
# [1] 1096 3

Try to avoid iterating through rows. It is rarely necessary, if ever. Here is one approach (replace n with 1096 and elem with igel$Grid):
n = 20
elem = 1:5
df.total = list()
for (i in 1:5) {
a = data.frame(number = sample(elem, n, replace=TRUE))
a$count = as.numeric(duplicated(a$number))
df.total[[i]] = a
}
df.total = as.data.frame(df.total)
df.total
## number count number.1 count.1 number.2 count.2 number.3 count.3 number.4 count.4
## 1 4 0 2 0 5 0 4 0 1 0
## 2 3 0 5 0 3 0 4 1 3 0
## 3 5 0 3 0 4 0 2 0 4 0
## 4 5 1 1 0 2 0 5 0 3 1
## 5 2 0 4 0 2 1 5 1 5 0
## 6 4 1 2 1 2 1 5 1 5 1
## 7 5 1 1 1 3 1 2 1 4 1
## 8 5 1 2 1 5 1 5 1 4 1
## 9 2 1 1 1 1 0 1 0 1 1
## 10 3 1 1 1 5 1 4 1 1 1
## 11 5 1 3 1 1 1 3 0 5 1
## 12 2 1 1 1 2 1 5 1 1 1
## 13 3 1 5 1 4 1 5 1 4 1
## 14 1 0 4 1 2 1 4 1 1 1
## 15 4 1 4 1 2 1 5 1 1 1
## 16 4 1 2 1 5 1 2 1 5 1
## 17 3 1 1 1 1 1 3 1 2 0
## 18 2 1 2 1 2 1 2 1 2 1
## 19 2 1 3 1 1 1 2 1 1 1
## 20 1 1 3 1 2 1 1 1 3 1

Related

Create a new dataframe with 1's and 0's from summarized data?

I have the below dataset that I am working with in R:
df <- data.frame(day=seq(1,3,1), tot.infected=c(1,2,4), tot.ind=5)
df
And I would like to transform the tot.infected column into a binomial variable with 1's and 0's, such as the following dataframe:
df2 <- data.frame(year = c(rep(1,5), rep(2,5), rep(3,5)), infected = c(rep(1,1), rep(0,4), rep(1,2), rep(0,3), rep(1,4), rep(0,1)))
Is there a more elegant way to do this in R?
Thank you for your help!
I tried hard-coding a dataframe using rep(), but this is extremely time-consuming for large datasets and I was looking for a more elegant way to achieve this.
base R
tmp <- do.call(Map, c(list(f = function(y, inf, ind) data.frame(year = y, infected = replace(integer(ind), seq(ind) <= inf, 1L))), unname(df)))
do.call(rbind, tmp)
# year infected
# 1 1 1
# 2 1 0
# 3 1 0
# 4 1 0
# 5 1 0
# 6 2 1
# 7 2 1
# 8 2 0
# 9 2 0
# 10 2 0
# 11 3 1
# 12 3 1
# 13 3 1
# 14 3 1
# 15 3 0
dplyr
library(dplyr)
df %>%
rowwise() %>%
summarize(tibble(year = day, infected = replace(integer(tot.ind), seq(tot.ind) <= tot.infected, 1L)))
# # A tibble: 15 x 2
# year infected
# <dbl> <int>
# 1 1 1
# 2 1 0
# 3 1 0
# 4 1 0
# 5 1 0
# 6 2 1
# 7 2 1
# 8 2 0
# 9 2 0
# 10 2 0
# 11 3 1
# 12 3 1
# 13 3 1
# 14 3 1
# 15 3 0
We can do it this way:
library(dplyr)
df %>%
group_by(day) %>%
summarise(cur_data()[seq(unique(tot.ind)),]) %>%
#mutate(x = row_number())
mutate(tot.infected = ifelse(row_number() <= first(tot.infected),
first(tot.infected)/first(tot.infected), 0), .keep="used")
day tot.infected
<dbl> <dbl>
1 1 1
2 1 0
3 1 0
4 1 0
5 1 0
6 2 1
7 2 1
8 2 0
9 2 0
10 2 0
11 3 1
12 3 1
13 3 1
14 3 1
15 3 0
Using rep.int and replace, basically.
with(df, data.frame(
year=do.call(rep.int, unname(df[c(1, 3)])),
infected=unlist(Map(replace, Map(rep.int, 0, tot.ind), lapply(tot.infected, seq), 1))
))
# year infected
# 1 1 1
# 2 1 0
# 3 1 0
# 4 1 0
# 5 1 0
# 6 2 1
# 7 2 1
# 8 2 0
# 9 2 0
# 10 2 0
# 11 3 1
# 12 3 1
# 13 3 1
# 14 3 1
# 15 3 0
Data:
df <- structure(list(day = c(1, 2, 3), tot.infected = c(1, 2, 4), tot.ind = c(5,
5, 5)), class = "data.frame", row.names = c(NA, -3L))

Creating list of all pairwise comparisons within data frame in R

From a data frame in R that has X Y coordinates (see example) I would like to add to rows (final X and final Y) to show all possible pairwise comparisons between the two.
dt = data.frame(X = seq(1, 5, by=1), Y = seq(1, 5, by=1))
This is the final goal but there should be a row for every possible combination of x, y and final_x, final_y
You can use expand.grid:
eg <- expand.grid(final_Y = 1:5, Y = 1:5, final_X = 1:5, X = 1:5)[,c(4,2,3,1)]
head(eg, n=20)
# X Y final_X final_Y
# 1 1 1 1 1
# 2 1 1 1 2
# 3 1 1 1 3
# 4 1 1 1 4
# 5 1 1 1 5
# 6 1 2 1 1
# 7 1 2 1 2
# 8 1 2 1 3
# 9 1 2 1 4
# 10 1 2 1 5
# 11 1 3 1 1
# 12 1 3 1 2
# 13 1 3 1 3
# 14 1 3 1 4
# 15 1 3 1 5
# 16 1 4 1 1
# 17 1 4 1 2
# 18 1 4 1 3
# 19 1 4 1 4
# 20 1 4 1 5
nrow(eg)
# [1] 625
I defined the columns out of order and reordered them simply to match the ordering of your expected output. One could easily do expand.grid(X=,Y=,final_X=,final_Y=) and leave off the [,c(...)] and the effective results would be the same but in a different row-order.

Looping through specified columns in a Matrix and replacing their values by subtracting the value from 4

I am new(ish) to R and I am still unsure about loops.
If I had a large matrix object in R with columns having values of 0 - 4, and I would like to invert these values for specified columns.
I would use the code:
b[, "AX1"] <- 4 - b[, "AX1"]
Where b is a Matrix extracted from a larger list object and AX1 would be a column in the matrix.
I would then replace the changed Matrix back into its list using the code:
DF1$geno[[1]]$data <- b
How would I loop this code through a list of column names(AX1, AX10, AX42, ...)for about 30 columns of the 5000 columns in the matrix if I used a list with the 30 Column names to be inverted?
The simplest way you can do it (assuming that you always transform it the way x = 4 - x) is to expand your approach to the list of columns:
# Create an example dataset
set.seed(68859457)
(
dat <- matrix(
data = sample(x = 0:4, size = 100, replace = TRUE),
nrow = 10,
dimnames = list(1:10, paste('AX', 1:10, sep = ''))
)
)
# AX1 AX2 AX3 AX4 AX5 AX6 AX7 AX8 AX9 AX10
# 1 2 1 2 3 2 2 3 1 0 4
# 2 4 3 4 4 0 1 3 1 3 4
# 3 3 0 3 4 2 2 4 1 2 1
# 4 2 2 0 2 4 2 2 1 1 0
# 5 4 4 4 3 3 1 0 3 2 2
# 6 2 1 1 0 3 3 4 4 1 0
# 7 2 3 1 3 3 1 0 1 0 4
# 8 2 2 1 1 0 3 1 3 2 1
# 9 3 1 4 1 2 1 0 0 4 1
# 10 4 3 2 4 1 0 2 0 3 2
# Create a list of columns you want to modify
set.seed(68859457)
(
cols_to_invert <- sort(sample(x = colnames(dat), size = 5))
)
# [1] "AX3" "AX4" "AX5" "AX6" "AX9"
# Use the list of columns created above to modify matrix in place
dat[, cols_to_invert] <- 4 - dat[, cols_to_invert]
# See the result
dat
# AX1 AX2 AX3 AX4 AX5 AX6 AX7 AX8 AX9 AX10
# 1 2 1 2 1 2 2 3 1 4 4
# 2 4 3 0 0 4 3 3 1 1 4
# 3 3 0 1 0 2 2 4 1 2 1
# 4 2 2 4 2 0 2 2 1 3 0
# 5 4 4 0 1 1 3 0 3 2 2
# 6 2 1 3 4 1 1 4 4 3 0
# 7 2 3 3 1 1 3 0 1 4 4
# 8 2 2 3 3 4 1 1 3 2 1
# 9 3 1 0 3 2 3 0 0 0 1
# 10 4 3 2 0 3 4 2 0 1 2
Difficult to tell without knowing exact structure of the data but based on your explanation and attempt maybe this will help.
cols <- c('AX1', 'AX10', 'AX42')
DF1$geno <- lapply(DF1$geno, function(x) {
x$data <- 4 - x$data[, cols]
x
})

multiple sampling in R [duplicate]

This question already has answers here:
How to split data into training/testing sets using sample function
(28 answers)
Randomly sample data frame into 3 groups in R
(4 answers)
Closed 3 years ago.
I'd like to know how to make multiple sampling in R. For example, when I try dividing some data into 60(train data):40(validate data), I can write the code like this:
original.data = read.csv("~.csv", na.strings="")
train.index = sample(c(1:dim(original.data)[1]), dim(original.data)[1]*0.6)
train.data = original.data[train.index,]
valid.data = original.data[-train.index,]
However, it is so hard to figure out making multiple sampling like dividing some data into 60:20:20.
I would appreciate if you make me know the best solution!
If you want more than two sets, then the other solutions are close but you need just a little more. There are at least two options.
First:
set.seed(2)
table(samp <- sample(1:3, size = nrow(iris), prob = c(0.6, 0.2, 0.2), replace = TRUE))
# 1 2 3
# 93 35 22
nrow(iris) # 150
set1 <- iris[samp == 1,]
set2 <- iris[samp == 2,]
set3 <- iris[samp == 3,]
set1 <- iris[samp == 1,]
set2 <- iris[samp == 2,]
set3 <- iris[samp == 3,]
nrow(set1)
# [1] 93
nrow(set2)
# [1] 35
nrow(set3)
# [1] 22
Because it's random, you want always get your exact proportions.
Second:
If you must have exact proportions, you can do this:
ns <- nrow(iris) * c(0.6, 0.2, 0.2)
sum(ns)
# [1] 150
### in case of rounding (and sum != nrow) ... just fix one of ns
rep(1:3, times = ns)
# [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# [46] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# [91] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
# [136] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
set.seed(2)
head(samp <- sample(rep(1:3, times = ns)))
# [1] 1 2 1 1 3 3
set1 <- iris[samp == 1,]
set2 <- iris[samp == 2,]
set3 <- iris[samp == 3,]
nrow(set1)
# [1] 90
nrow(set2)
# [1] 30
nrow(set3)
# [1] 30
This can easily be generalized to support an arbitrary number of partitions.

Most frequent values in sliding window dataframe in R

I have the following dataframe (df):
A B T Required col (window = 3)
1 1 0 1
2 3 0 3
3 4 0 4
4 2 1 1 4
5 6 0 0 2
6 4 1 1 0
7 7 1 1 1
8 8 1 1 1
9 1 0 0 1
I would like to add the required column, as followed:
Insert in the current row the previous row value of A or B.
If in the last 3 (window) rows most of time the content of A column is equal to T column - choose A, otherwise - B. (There can be more columns - so the content of the column with the most times equal to T will be chosen).
What is the most efficient way to do it for big data table.
I changed the column named T to be named TC to avoid confusion with T as an abbreviation for TRUE
library(tidyverse)
library(data.table)
df[, newcol := {
equal <- A == TC
map(1:.N, ~ if(.x <= 3) NA
else if(sum(equal[.x - 1:3]) > 3/2) A[.x - 1]
else B[.x - 1])
}]
df
# N A B TC newcol
# 1: 1 1 0 1 NA
# 2: 2 3 0 3 NA
# 3: 3 4 0 4 NA
# 4: 4 2 1 1 4
# 5: 5 6 0 0 2
# 6: 6 4 1 1 0
# 7: 7 7 1 1 1
# 8: 8 8 1 1 1
# 9: 9 1 0 0 1
This works too, but it's less clear, and likely less efficient
df[, newcol := shift(A == TC, 1:3) %>%
pmap_lgl(~sum(...) > 3/2) %>%
ifelse(shift(A), shift(B))]
data:
df <- fread("
N A B TC
1 1 0 1
2 3 0 3
3 4 0 4
4 2 1 1
5 6 0 0
6 4 1 1
7 7 1 1
8 8 1 1
9 1 0 0
")
Probably much less efficient than the answer by Ryan, but without additional packages.
A<-c(1,3,4,2,6,4,7,8,1)
B<-c(0,0,0,1,0,1,1,1,0)
TC<-c(1,3,4,1,0,1,1,1,0)
req<-rep(NA,9)
df<-data.frame(A,B,TC,req)
window<-3
for(i in window:(length(req)-1)){
equal <- sum(df$A[(i-window+1):i]==df$TC[(i-window+1):i])
if(equal > window/2){
df$req[i+1]<-df$A[i]
}else{
df$req[i+1]<-df$B[i]
}
}

Resources