Random subset/sample of dataframe - r

I have a doubt about how to sample a dataframe. The dataset is like this:
with 114 rows with 9 columns. I need to extract 3 subsets, each one of 38 rows (114 / 3).
I have this script, but it doesn't works for the last subset:
install.packages("Rcmdr")
library(Rcmdr)
ana <- read.delim("~/Desktop/ana", header=TRUE, dec=",")
set1 <- ana[sample(nrow(ana), 38), ]
set1.index <- as.numeric(rownames(set1))
ana2 <- ana[(-set1.index),]
set2 <- ana2[sample(nrow(ana2), 38), ]
set2.index <- as.numeric(rownames(set2))
ana3 <- ana2[(-set2.index),]
ana3
For set1 and set2 I get the subsets correctly, but for set3 I get 50 rows (or less).
(Thank you in advance! =) )

Generally #docendodiscimus give valid advice but the sampling code he offers will not guarantee equal numbers in the subsets (see below). Try this instead:
set.seed(123) # best to set a seed to allow roproducibility
sampidx <- sample( rep(1:3, each=38)
set1 <- ana[sampidx==1, ] # logical indexing of dataframe
set2 <- ana[sampidx==2, ]
set3 <- ana[sampidx==3, ]
Lack of equipartition with sample using replacement:
> table( sample(1:3, nrow(iris), replace = TRUE) )
1 2 3
52 52 46
> table( sample(1:3, nrow(iris), replace = TRUE) )
1 2 3
51 49 50 # notice that it also varies from draw to draw
> table(sampidx)
sampidx
1 2 3
38 38 38

Related

R: Create Integer Array from Two Columns, then Plot Histogram

I have a dataframe with 2 numeric columns. For each row, I want to create an array of integers that fall between the values in the columns, and that includes the values in the column. Then, I want to compile all of the values into a single column to generate a histogram.
Input:
df
C1 C2
A 3 -92
B 8 -162
C 20 -97
D 50 -76
Output:
sdf5$Values
-92
-91
-90
...
2
3
-162
-161
...
7
8
...
My actual dataframe has 62 rows. My current code gives me frequencies > 100 (should have a maximum of 62 for any integer). The code worked on a dummy dataframe, so I'm not sure where things are going wrong.
list <- mapply(":", df$C2, df$C1)
df3 <- do.call(rbind.data.frame, list)
sdf3 <- stack(df3)
sdf4 <- as.data.frame(sdf3$values)
sdf5 <- rename(sdf4, Values = 1)
a <- ggplot(sdf5, aes(x=Values)) +
geom_histogram(binwidth = 1, center=0)
I'm not sure what exactly goes wrong, but I think the rbind.data.frame is causing some troubles with the list input. As an alternative:
library(ggplot2)
df <- read.table(text = " C1 C2
A 3 -92
B 8 -162
C 20 -97
D 50 -76")
list <- mapply(":", df$C2, df$C1)
df2 <- data.frame(Values = do.call(c, list))
ggplot(df2, aes(x=Values)) +
geom_histogram(binwidth = 1, center=0)
Created on 2021-02-08 by the reprex package (v1.0.0)
There must be something off going on with the stack function, you can check it using table. To put all list numbers into a single vector I'd use unlist.
df=data.frame(C1=floor(runif(80,0,200)),C2=floor(runif(80,-200,0)))
list <- mapply(":", df$C2, df$C1)
df3 <- do.call(rbind.data.frame, list)
sdf3 <- stack(df3)
sdf4 <- data.frame("Values"=sdf3$values)
table(sdf4)
# This returns the count of each unique value and some go up to 200,
# notably the limits of my unif distribution
If you use unlist, it gives the desired result.
df=data.frame(C1=floor(runif(80,0,200)),C2=floor(runif(80,-200,0)))
list <- mapply(":", df$C2, df$C1)
vec <- data.frame("Values"=unlist(list))
a <- ggplot(vec, aes(x=Values)) +
geom_histogram(binwidth = 1, center=0)
I don't know the stack function, but the problem must be there somehow.

Improve sampling between R dataframes

I am trying to improve speed in a case where I need to use data from dataframes and sample from other dataframes.
First I need to draw the number of samples I want from df_obs.
Then I need to determine a subset from where to sample based on which month I am in (the subset command).
Then I want to sample from the corresponding sample dataframe.
And finally put it all together in a new dataframe.
The code below Works, but it is far to slow, when I have to repeat this 1000 times. Is there an alternative method which uses apply functions better? Or perhaps some data.table function?
#Sample function to sample correct in case of only one value to sample from
resample <- function(x, ...) x[sample.int(length(x), ...)]
#Creating dummy data
no_pr_month <- sort(sample(67:120, 20))
df_obs <- data.frame(replicate(20,sample(0:5,1000,rep=TRUE)))
colnames(df_obs) <- no_pr_month
amount <- sample(50:50000,200)
month <- sample(no_pr_month,200, rep=TRUE)
df <- data.frame(month,amount)
df_sum <- data.frame(matrix(NA, ncol = 20, nrow=1000))
#The far too slow loop
for (k in 1:length(no_pr_month)){
a <- df_obs[,k]
df_sample <- subset(df, df$month == names(df_obs[k]))
df_out <- sapply(a, function(x) sum(resample(df_sample$amount, x,replace = TRUE)))
df_sum[,k] <- df_out
}
Note: before creating your data, I inserted set.seed(000) for consistent results
Even when using the data.table package, it's best to keep data organized in a "tidy" way: row-based.
So we'll start by changing your df_obs dataset to a long-form data.table.
library(data.table)
df_obs_long <- data.table(
month = as.integer(rep(names(df_obs), each = nrow(df_obs))),
obs_count = unlist(df_obs)
)
df_obs_long
# month obs_count
# 1: 69 4
# 2: 69 5
# 3: 69 1
# 4: 69 3
# 5: 69 0
# ---
# 19996: 116 4
# 19997: 116 1
# 19998: 116 2
# 19999: 116 3
# 20000: 116 5
Next we'll define a function that takes a vector of sample sizes and the number of the month to draw samples from. The function will return a vector of sample sums for each of the sizes given.
Making df a data.table doesn't save much as far as written code, but can cut down runtime by a good amount.
setDT(df)
sample_and_sum_month <- function(sizes, month_number) {
choices <- df[month == month_number, amount]
vapply(
sizes,
FUN.VALUE = numeric(1),
FUN = function(s) {
sum(resample(choices, size = s, replace = TRUE))
}
)
}
sample_and_sum_month(1:3, 69)
# [1] 12729 55068 28605
Finally, we can just add the sums as a new column in df_obs_long.
df_obs_long[
,
sample_sum := sample_and_sum_month(obs_count, .BY[["month"]]),
by = "month"
]
df_obs_long
# month obs_count sample_sum
# 1: 69 4 82662
# 2: 69 5 160761
# 3: 69 1 5743
# 4: 69 3 108783
# 5: 69 0 0
# ---
# 19996: 116 4 56792
# 19997: 116 1 22570
# 19998: 116 2 35337
# 19999: 116 3 64734
# 20000: 116 5 69075

Find top deciles from dataframe by group

I am attempting to create new variables using a function and lapply rather than working right in the data with loops. I used to use Stata and would have solved this problem with a method similar to that discussed here.
Since naming variables programmatically is so difficult or at least awkward in R (and it seems you can't use indexing with assign), I have left the naming process until after the lapply. I am then using a for loop to do the renaming prior to merging and again for the merging. Are there more efficient ways of doing this? How would I replace the loops? Should I be doing some sort of reshaping?
#Reproducible data
data <- data.frame("custID" = c(1:10, 1:20),
"v1" = rep(c("A", "B"), c(10,20)),
"v2" = c(30:21, 20:19, 1:3, 20:6), stringsAsFactors = TRUE)
#Function to analyze customer distribution for each category (v1)
pf <- function(cat, df) {
df <- df[df$v1 == cat,]
df <- df[order(-df$v2),]
#Divide the customers into top percents
nr <- nrow(df)
p10 <- round(nr * .10, 0)
cat("Number of people in the Top 10% :", p10, "\n")
p20 <- round(nr * .20, 0)
p11_20 <- p20-p10
cat("Number of people in the 11-20% :", p11_20, "\n")
#Keep only those customers in the top groups
df <- df[1:p20,]
#Create a variable to identify the percent group the customer is in
top_pct <- integer(length = p10 + p11_20)
#Identify those in each group
top_pct[1:p10] <- 10
top_pct[(p10+1):p20] <- 20
#Add this variable to the data frame
df$top_pct <- top_pct
#Keep only custID and the new variable
df <- subset(df, select = c(custID, top_pct))
return(df)
}
##Run the customer distribution function
v1Levels <- levels(data$v1)
res <- lapply(v1Levels, pf, df = data)
#Explore the results
summary(res)
# Length Class Mode
# [1,] 2 data.frame list
# [2,] 2 data.frame list
print(res)
# [[1]]
# custID top_pct
# 1 1 10
# 2 2 20
#
# [[2]]
# custID top_pct
# 11 1 10
# 16 6 10
# 12 2 20
# 17 7 20
##Merge the two data frames but with top_pct as a different variable for each category
#Change the new variable name
for(i in 1:length(res)) {
names(res[[i]])[2] <- paste0(v1Levels[i], "_top_pct")
}
#Merge the results
res_m <- res[[1]]
for(i in 2:length(res)) {
res_m <- merge(res_m, res[[i]], by = "custID", all = TRUE)
}
print(res_m)
# custID A_top_pct B_top_pct
# 1 1 10 10
# 2 2 20 20
# 3 6 NA 10
# 4 7 NA 20
Stick to your Stata instincts and use a single data set:
require(data.table)
DT <- data.table(data)
DT[,r:=rank(v2)/.N,by=v1]
You can see the result by typing DT.
From here, you can group the within-v1 rank, r, if you want to. Following Stata idioms...
DT[,g:={
x = rep(0,.N)
x[r>.8] = 20
x[r>.9] = 10
x
}]
This is like gen and then two replace ... if statements. Again, you can see the result with DT.
Finally, you can subset with
DT[g>0]
which gives
custID v1 v2 r g
1: 1 A 30 1.000 10
2: 2 A 29 0.900 20
3: 1 B 20 0.975 10
4: 2 B 19 0.875 20
5: 6 B 20 0.975 10
6: 7 B 19 0.875 20
These steps can also be chained together:
DT[,r:=rank(v2)/.N,by=v1][,g:={x = rep(0,.N);x[r>.8] = 20;x[r>.9] = 10;x}][g>0]
(Thanks to #ExperimenteR:)
To rearrange for the desired output in the OP, with values of v1 in columns, use dcast:
dcast(
DT[,r:=rank(v2)/.N,by=v1][,g:={x = rep(0,.N);x[r>.8] = 20;x[r>.9] = 10;x}][g>0],
custID~v1)
Currently, dcast requires the latest version of data.table, available (I think) from Github.
You don't need the function pf to achieve what you want. Try dplyr/tidyr combo
library(dplyr)
library(tidyr)
data %>%
group_by(v1) %>%
arrange(desc(v2))%>%
mutate(n=n()) %>%
filter(row_number() <= round(n * .2)) %>%
mutate(top_pct= ifelse(row_number()<=round(n* .1), 10, 20)) %>%
select(custID, top_pct) %>%
spread(v1, top_pct)
# custID A B
#1 1 10 10
#2 2 20 20
#3 6 NA 10
#4 7 NA 20
The idiomatic way to do this kind of thing in R would be to use a combination of split and lapply. You're halfway there with your use of lapply; you just need to use split as well.
lapply(split(data, data$v1), function(df) {
cutoff <- quantile(df$v2, c(0.8, 0.9))
top_pct <- ifelse(df$v2 > cutoff[2], 10, ifelse(df$v2 > cutoff[1], 20, NA))
na.omit(data.frame(id=df$custID, top_pct))
})
Finding quantiles is done with quantile.

Removing rows of data in R

What's the most reliable way to remove matching Ids from two large data frames in large?
For example, I have a list of participants who do not want to be contacted (n=200). I would like to remove them from my dataset of over 100 variables and 200,000 observations.
This is the list of 200 participants ids that I need to remove from the dataset.
exclude=read.csv("/home/Project/file/excludeids.csv", header=TRUE, sep=",")
dataset.exclusion<- dataset[-which(exclude$ParticipantId %in% dataset$ParticipantId ), ]
Is this the correct command to use?
I don't think this command is doing what I want, because when I verify with the following: length(which(dataset.exclusion$ParticipantId %in% exclusion$ParticipantId))
I don't get 0.
Any insight?
You can do this for example:
sample1[!sample1$ParticipantID %in%
unique(exclusion$ParticipantId),]
Something like this?
library(data.table)
dataset <- data.table(
a = c(1,2,3,4,5,6),
b = c(11,12,13,14,15,16),
d = c(21,22,23,24,25,26)
)
setkeyv(dataset, c('a','b'))
ToExclude <- data.table(
a = c(1,2,3),
b = c(11,12,13)
)
dataset[!ToExclude]
# a b d
# 1: 4 14 24
# 2: 5 15 25
# 3: 6 16 26

How to create single table by extracting certain cells from multiple CSV files

I am wondering if it is possible to create a new dataframe with certain cells from each file from the working directory. for example say If I have 2 data frame like this (please ignore the numbers as they are random):
Say in each dataset, row 4 is the sum of my value and Row 5 is number of missing values. If I represent number of missing values as "M" and Sum of coloumns as "N", what I am trying to acheive is the following table:
So each file 'N' and 'M' are in 1 single row.
I have many files in the directory so I have read them in a list, but not sure what would be the best way to perform such task on a list of files.
this is my sample code for the tables I have shown and how I read them in list:
##Create sample data
df = data.frame(Type = 'wind', v1=c(1,2,3,100,50), v2=c(4,5,6,200,60), v3=c(6,7,8,300,70))
df2 =data.frame(Type = 'test', v1=c(3,2,1,400,40), v2=c(2,3,4,500,30), v3=c(6,7,8,600,20))
# write to directory
write.csv(df, file = "sample1.csv", row.names = F)
write.csv(df2, file = "sample2.csv", row.names = F)
# read to list
mycsv = dir(pattern=".csv")
n <- length(mycsv)
mylist <- vector("list", n)
for(i in 1:n) mylist[[i]] <- read.csv(mycsv[i],header = TRUE)
I would be really greatful if you could give me some suggestion about if this possible and how I should approch?
Many thanks,
Ayan
This should work:
processFile <- function(File) {
d <- read.csv(File, skip = 4, nrows = 2, header = FALSE,
stringsAsFactors = FALSE)
dd <- data.frame(d[1,1], t(unlist(d[-1])))
names(dd) <- c("ID", "v1N", "V1M", "v2N", "V2M", "v3N", "V3M")
return(dd)
}
ll <- lapply(mycsv, processFile)
do.call(rbind, ll)
# ID v1N V1M v2N V2M v3N V3M
# 1 wind 100 50 200 60 300 70
# 2 test 400 40 500 30 600 20
(The one slightly tricky/unusual bit comes in that third line of processFile(). Here's a code snippet that should help you see how it accomplishes what it does.)
(d <- data.frame(a="wind", b=1:2, c=3:4))
# a b c
# 1 wind 1 3
# 2 wind 2 4
t(unlist(d[-1]))
# b1 b2 c1 c2
# [1,] 1 2 3 4
CAVEAT: I'm not sure I fully understand what you want. I think you're reading in a list and want to select certain dataframes from that list with the same rows from that list. Then you want to create a data frame of those rows and go from long to wide format.
LIST <- lapply(2:3, function(i) {
x <- mylist[[i]][4:5, ]
x <- data.frame(x, row = factor(rownames(x)))
return(x)
}
)
DF <- do.call("rbind", LIST) #lets you bind an unknown number of rows from a list
levels(DF$row) <- list(M =4, N = 5) #recodes rows 4 and 5 with M and N
wide <- reshape(DF, v.names=c("v1", "v2", "v3"), idvar=c("Type"),
timevar="row", direction="wide") #reshape from long to wide
rownames(wide) <- 1:nrow(wide) #give proper row names
wide
This yields:
Type v1.M v2.M v3.M v1.N v2.N v3.N
1 wind 100 200 300 50 60 70
2 test 400 500 600 40 30 20

Resources