Create a categorical variable - r

I want to categorize a variable using the following conditions:
0 - 4: "fail"
5 - 7: "good"
8 - 10: "excellent"
None of the above: NA
I tried using the recode function. The variable's values are numeric:
segur <- data$segur
I created a new variable using recode:
dt1 <- recode(segur, "c(0,4)='fail'; c(5,7)='good'; c(8,10)='excellent'; else='NA'")
dt1
How can I fix this?
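Assuming recode() here comes from the car package, the usual fix is to write the ranges as lo:hi rather than c(lo, hi) (which only matches the two endpoint values), and to use an unquoted NA so the fall-through becomes a real missing value instead of the string "NA":
library(car)
dt1 <- recode(segur, "0:4='fail'; 5:7='good'; 8:10='excellent'; else=NA")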

Using factor in base R:
Data:
# set random seed
set.seed(1L)
# without any NA
x1 <- sample(x = 1:10, size = 20, replace=TRUE)
# with NA
x2 <- sample(x = c(1:10, NA), size = 20, replace=TRUE)
Code:
# without any NA
as.character(factor(x1, levels = c(0:10), labels = c(rep("fail", 5), rep("good", 3), rep("excellent", 3)), exclude=NA))
# with NA
as.character(factor(x2, levels = c(0:10), labels = c(rep("fail", 5), rep("good", 3), rep("excellent", 3)), exclude=NA))

I guess you can use cut like below
cut(segur, c(0, 4, 7, 10), labels = c("fail", "good", "excellent"))
Example
> segur
[1] 6 1 4 -2 -1 10 8 0 5 9
> cut(segur, c(0, 4, 7, 10), labels = c("fail", "good", "excellent"))
[1] good fail fail <NA> <NA> excellent excellent
[8] <NA> good excellent
Levels: fail good excellent
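Note that cut() uses right-closed intervals by default, so 0 falls outside (0, 4] and becomes NA, as in the output above. If 0 should count as "fail", add include.lowest = TRUE to close the first interval:
# include 0 in the first bin; values outside 0-10 still become NA
cut(segur, c(0, 4, 7, 10), labels = c("fail", "good", "excellent"), include.lowest = TRUE)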

Here is a solution using the fmtr package. You can create a categorical format using the value and condition functions, and then apply the format to the numeric data using the fapply function. Here is an example:
library(fmtr)
# Create sample data
df <- read.table(header = TRUE, text = '
ID segur
1 0
2 8
3 5
4 11
5 7')
# Create format
fmt <- value(condition(x >= 0 & x <=4, "fail"),
condition(x >= 5 & x <=7, "good"),
condition(x >= 8 & x <= 10, "excellent"),
condition(TRUE, NA))
# Apply categorization
df$segur_cat <- fapply(df$segur, fmt)
# View results
df
# ID segur segur_cat
# 1 1 0 fail
# 2 2 8 excellent
# 3 3 5 good
# 4 4 11 <NA>
# 5 5 7 good

Related

Replace values with the subsetting operator [

I have the following data:
set.seed(1)
df_1 <- data.frame(x = replicate(n = 2, expr = sample(x = 1:3, size = 20, replace = T)),
y = as.factor(sample(x = 1:5, size = 20, replace = TRUE)))
I want to replace the numbers >= 2 with 9 in x.1 and x.2 simultaneously:
df_1[df_1$x.1, df_1$x.2 >= 2] <- 9
Error in [<-.data.frame(*tmp*, df_1$x.1, df_1$x.2 >= 2, value = 9) :
duplicate subscripts for columns
And I want to replace the number 3 with 99 in y.
df_1$y[df_1$y %in% c('3')] <- 99
Warning message:
In [<-.factor(*tmp*, df_1$y %in% c("3"), value = c(2L, 5L, 2L, :
invalid factor level, NA generated
Thanks.
We can use replace
df_1[1:2] <- replace(df_1[1:2], df_1[1:2] >=2, 9)
Another option is to create a logical matrix on the subset of the 'x.' columns, extract those values and assign 9 to them:
df_1[1:2][df_1[1:2] >= 2] <- 9
For changing the factor, we either need to call factor again or add the level beforehand:
levels(df_1$y) <- c(levels(df_1$y), "99")
df_1$y
#[1] 4 4 4 2 4 1 1 4 1 2 3 2 2 5 2 1 3 3 4 3
#Levels: 1 2 3 4 5 99
df_1$y[df_1$y == '3'] <- '99'
df_1$y
#[1] 4 4 4 2 4 1 1 4 1 2 99 2 2 5 2 1 99 99 4 99
##Levels: 1 2 3 4 5 99
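The "call factor again" route mentioned above would look roughly like this (a sketch starting from the original factor):
# convert to character, replace, then rebuild the factor
y_chr <- as.character(df_1$y)
y_chr[y_chr == '3'] <- '99'
df_1$y <- factor(y_chr)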
Or, as @thelatemail mentioned, we can rename the level directly, which drops the old level as part of the replacement:
levels(df_1$y)[levels(df_1$y) == '3'] <- "99"
Or we can use fct_recode from forcats:
library(forcats)
df_1$y <- fct_recode(df_1$y, "99" = "3")

Remove rows with zero-variance in R

I have a dataframe of survey responses (rows = participants, columns = question responses). Participants responded to 50 questions on a 5-point Likert scale. I would like to remove participants who answered 5 across all 50 questions, as they have zero variance and are likely to bias my results.
I have seen the nearZeroVar() function, but was wondering if there's a way to do this in base R?
Many thanks,
R
If you had this dataframe:
df <- data.frame(col1 = rep(1, 10),
col2 = 1:10,
col3 = rep(1:2, 5))
You could calculate the variance of each column and select only those columns whose variance is not 0, or is at least some threshold, which is close to what nearZeroVar() would do:
df[, sapply(df, var) != 0]
df[, sapply(df, var) >= 0.3]
If you wanted to exclude rows, you could do something similar, but loop through the rows instead and then subset:
df[apply(df, 1, var) != 0, ]
df[apply(df, 1, var) >= 0.3, ]
Assuming you have data like this.
survey <- data.frame(participants = c(1:10),
q1 = c(1,2,5,5,5,1,2,3,4,2),
q2 = c(1,2,5,5,5,1,2,3,4,3),
q3 = c(3,2,5,4,5,5,2,3,4,5))
You can do the following.
idx <- which(apply(survey[, -1], 1, function(x) all(x == 5)))
survey[-idx,]
This will remove rows where all values equal 5.
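One caveat: if no participant answered all 5s, idx has length zero and survey[-idx, ] drops every row, so a guard may be safer:
idx <- which(apply(survey[, -1], 1, function(x) all(x == 5)))
if (length(idx) > 0) survey <- survey[-idx, ]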
# Dummy data:
df <- data.frame(
matrix(
sample(1:5, 100000, replace =TRUE),
ncol = 5
)
)
names(df) <- paste0("likert", 1:5)
df$id <- 1:nrow(df)
head(df)
likert1 likert2 likert3 likert4 likert5 id
1 1 2 4 4 5 1
2 5 4 2 2 1 2
3 2 1 2 1 5 3
4 5 1 3 3 2 4
5 4 3 3 5 1 5
6 1 3 3 2 3 6
dim(df)
[1] 20000 6
# Clean out rows where all likert values are 5
df <- df[rowSums(df[grepl("likert", names(df))] == 5) != 5, ]
nrow(df)
[1] 19995
Stealing @AshOfFire's data, with a small modification since you say your columns contain only answers and not participant IDs:
survey <- data.frame(q1 = c(1,2,5,5,5,1,2,3,4,2),
q2 = c(1,2,5,5,5,1,2,3,4,3),
q3 = c(3,2,5,4,5,5,2,3,4,5))
survey[!apply(survey==survey[[1]],1,all),]
# q1 q2 q3
# 1 1 1 3
# 4 5 5 4
# 6 1 1 5
# 10 2 3 5
The equality test builds a data.frame of logical values; apply then lets us keep the rows that are not entirely TRUE.

data.table - efficiently manipulate large data set

I am amazed by the blazing speed of data.table. The code below does exactly what I need, but when executed on a large table it does not perform very well.
I am convinced that this can be done faster with data.table, but I do not see how.
Output
The output needs to be a matrix whose rownames form a regular sequence of days.
For each column separately:
All values before the first value need to be NA
All values after the last value need to be NA
Between the first and the last value, 0s need to be added for days that do not exist in the input table
The following shows what the result should look like:
M <-
matrix(c(NA, NA, NA, 2, 0, 1, 3, 0, 2 , NA,
NA, NA, 3, 1, 3, 2, 1, 2, NA, NA),
ncol = 2,
dimnames = list(as.character((Sys.Date() + 0:9)),
c("E1", "E2")))
Output example
## E1 E2
## 2017-01-27 NA NA
## 2017-01-28 NA NA
## 2017-01-29 NA 2
## 2017-01-30 2 2
## 2017-01-31 0 2
## 2017-02-01 3 1
## 2017-02-02 1 3
## 2017-02-03 0 3
## 2017-02-04 2 NA
## 2017-02-05 NA NA
Input
The following table shows the source/input for the coding/function:
DS <- data.table(
E = c(rep("E1", 4), rep("E2", 6)),
C = c(c(Sys.Date() + c(3, 5, 6, 8)),
c(Sys.Date() + c(2, 3, 4, 5, 6, 7))),
S = round(runif(n = 10,min = 1, max = 3), 0),
key = c("E", "C"))
## E C S
## 1: E1 2017-01-30 3
## 2: E1 2017-02-01 1
## 3: E1 2017-02-02 2
## 4: E1 2017-02-04 1
## 5: E2 2017-01-29 3
## 6: E2 2017-01-30 2
## 7: E2 2017-01-31 3
## 8: E2 2017-02-01 1
## 9: E2 2017-02-02 2
## 10: E2 2017-02-03 3
Working code
The following few lines do exactly what I need and are simple. However, they are not efficient.
The real table has 700 unique C values and 2 million E values.
# Create the regular time line per day
CL <- c(C= (Sys.Date() + 0:9))
# Determine first and last per E
DM <- DS[, .(MIN = min(C), MAX = max(C)), by =.(E)]
# Generate all combinations
CJ <- CJ(E = DS$E, C = CL, unique = TRUE)
# Join
DC <- DS[CJ, on = .(E, C)][!is.na(E)]
# replace NA by 0
DC[is.na(S), S:=0]
# Lead-in
DC[DM, on=.(E, C<MIN), S:=NA]
# Lead-out
DC[DM, on=.(E, C>MAX), S:=NA]
# Cast to matrix format
DC2 <- dcast(
data = DC, formula = C ~ E,
fun.aggregate = sum, value.var = "S")
# coerce to matrix
M3 <- as.matrix(DC2[, -1])
# add row names
rownames(M3) <- format(CL, "%Y-%m-%d")
I wrote some long, unreadable, clumsy code that creates the matrix with 1.2B cells in 35 seconds. It must be possible to do this just as fast, but far more elegantly, with data.table - just not like this.
A data.table, like a data.frame, is underneath everything a list (with length equal to the number of columns).
200 million columns is a lot of columns - this will make anything slow.
The conversion to "wide" that you describe will bloat the data with a large number of NA values. You can almost certainly perform the analysis you need on the "long form", using keys.
It isn't clear from your question exactly what you need, but you can calculate the various sums (the column names ENTITY, CALDAY and VALUE below refer to the real data, corresponding to E, C and S in the DS example):
# convert to an IDate
DT[, CALDAY := as.IDate(CALDAY)]
# get range of dates
rangeDays <- DT[,range(CALDAY)]
all_days <- as.IDate(seq(rangeDays[1],rangeDays[2], by=1))
# create sums
DT_sum <- DT[, list(VALUE= sum(VALUE)), keyby = list(ENTITY, CALDAY)]
and then index using entity and dates.
DT_sum[.("2a8605e2-e283-11e6-a3bb-bbe3fd226f8d", all_days)]
and if you need to replace NA with 0
na_replace <- function(x,repl=0){x[is.na(x)]<-repl;x}
DT_sum[.("2a8605e2-e283-11e6-a3bb-bbe3fd226f8d", all_days), na_replace(VALUE)]
This does the trick, but the performance is still not good.
It takes DS as an input parameter. The result is a data.table which can be coerced to a matrix with:
M <- as.matrix(build_timeseries_DT(DS))
Function
build_timeseries_DT <- function(DS){
# regular time series for the complete range, with a row index
dtC <- data.table(
CAL = seq(min(DS$C), max(DS$C), by = "day"))[, idx:= 1:.N]
# add row index (idx) to sales
DQ <- dtC[DS, on = "CAL"]
setkey(DQ, "ENT")
# calculate min index per ENT
DM <- DQ[, .(MIN = min(idx), MAX = max(idx)), by = .(ENT)]
# allocate memory, assign 0 and set rownames by reference
DT <- dtC[, .(CAL)][, (DM[, ENT]):= 0L][, CAL:= NULL]
setattr(DT, "row.names", format(dtC$CAL, "%Y-%m-%d"))
# Set NA for the Lead-in and out, next populate values by ref
for(j in colnames(DT)){
set(x = DT,
i = c(1L:(DM[j, MIN]), (DM[j, MAX]):DT[, .N]),
j = j,
value = NA )
set(x = DT,
i = DQ[j, idx],
j = j,
value = DQ[j, SLS] )}
return(DT)
}
Test Data
DS <- data.table(
ENT = c("A", "A", "A", "B", "B", "C", "C", "C", "D", "D"),
CAL = c(Sys.Date() + c(0, 5, 6, 3, 8, 1, 2, 9, 3, 5)),
SLS = as.integer(c(1, 2, 1, 2, 3, 1, 2, 3, 2, 1)),
key = c("ENT", "CAL"))
ENT CAL SLS
1: A 2017-01-28 1
2: A 2017-02-02 2
3: A 2017-02-03 1
4: B 2017-01-31 2
5: B 2017-02-05 3
6: C 2017-01-29 1
7: C 2017-01-30 2
8: C 2017-02-06 3
9: D 2017-01-31 2
10: D 2017-02-02 1
Result
as.matrix(build_timeseries_DT(DS))
A B C D
[1,] 1 NA NA NA
[2,] 0 NA 1 NA
[3,] 0 NA 2 NA
[4,] 0 2 0 2
[5,] 0 0 0 0
[6,] 2 0 0 1
[7,] 1 0 0 NA
[8,] NA 0 0 NA
[9,] NA 3 0 NA
[10,] NA NA 3 NA

weighted table data frame with plyr

I'm working with survey data consisting of integer value responses for multiple questions (y1, y2, y3, ...) and a weighted count assigned to each respondent, like this:
foo <- data.frame(wcount = c(10, 1, 2, 3), # weighted counts
y1 = sample(1:5, 4, replace=T), # numeric responses
y2 = sample(1:5, 4, replace=T), #
y3 = sample(1:5, 4, replace=T)) #
>foo
wcount y1 y2 y3
1 10 5 5 5
2 1 1 4 4
3 2 1 2 5
4 3 2 5 3
and I'd like to transform this into a consolidated data frame version of a weighted table, with the first column representing the response values, and the next 3 columns representing the weighted counts. This can be done explicitly by column using:
library(Hmisc)
ty1 <- wtd.table(foo$y1, foo$wcount)
ty2 <- wtd.table(foo$y2, foo$wcount)
ty3 <- wtd.table(foo$y3, foo$wcount)
bar <- merge(ty1, ty2, all=T, by="x")
bar <- merge(bar, ty3, all=T, by="x")
names(bar) <- c("x", "ty1", "ty2", "ty3")
bar[is.na(bar)]<-0
>bar
x ty1 ty2 ty3
1 1 3 0 0
2 2 3 2 0
3 3 0 0 3
4 4 0 1 1
5 5 10 13 12
I suspect there is a way of automating this with plyr and numcolwise or ddply. For instance, the following comes close, but I'm not sure what else is needed to finish the job:
library(plyr)
bar2 <- numcolwise(wtd.table)(foo[c("y1","y2","y3")], foo$wcount)
>bar2
y1 y2 y3
1 1, 2, 5 2, 4, 5 3, 4, 5
2 3, 3, 10 2, 1, 13 3, 1, 12
Any thoughts?
Not a plyr answer, but this struck me as a reshaping/aggregating problem that could be tackled straightforwardly using functions from package reshape2.
First, melt the dataset, making a column of the response value which can be named x (the unique values in y1-y3).
library(reshape2)
dat2 = melt(foo, id.var = "wcount", value.name = "x")
Now this can be cast back wide with dcast, using sum as the aggregation function. This puts y1-y3 back as columns with the sum of wcount for each value of x.
# Cast back wide using the values within y1-y3 as response values
# and filling with the sum of "wcount"
dcast(dat2, x ~ variable, value.var = "wcount", fun.aggregate = sum)
Giving
x y1 y2 y3
1 1 3 0 0
2 2 3 2 0
3 3 0 0 3
4 4 0 1 1
5 5 10 13 12
You are describing a survey data set that uses replicate weights. See http://asdfree.com/ for many, many examples, but for RECS, do something like this:
library(survey)
x <- read.csv( "http://www.eia.gov/consumption/residential/data/2009/csv/recs2009_public.csv" )
rw <- read.csv( "http://www.eia.gov/consumption/residential/data/2009/csv/recs2009_public_repweights.csv" )
y <- merge( x , rw )
# create a replicate-weighted survey design object
z <- svrepdesign( data = y , weights = ~NWEIGHT , repweights = "brr_weight_[0-9]" )
# now run all of your analyses on the object `z` ..
# see the `survey` package homepage for details
# distribution
svymean( ~ factor( BASEHEAT ) , z )
# mean
svymean( ~ TOTHSQFT , z )

Groupby bins and aggregate in R

I have data like (a,b,c)
a b c
1 2 1
2 3 1
9 2 2
1 6 2
where the range of 'a' is divided into n (say 3) equal parts, and an aggregate function (say max) is computed on the 'b' values, also grouped by 'c'.
So the output looks like
a_bin b_m(c=1) b_m(c=2)
1-3 3 6
4-6 NaN NaN
7-9 NaN 2
The result is M x N, where M = the number of 'a' bins and N = the number of unique 'c' values (or the full range).
How do I approach this? Can any R package help me through?
A combination of aggregate, cut and reshape seems to work
df <- data.frame(a = c(1,2,9,1),
b = c(2,3,2,6),
c = c(1,1,2,2))
breaks <- c(0, 3, 6, 9)
# Aggregate data
ag <- aggregate(df$b, FUN=max,
by=list(a=cut(df$a, breaks, include.lowest=T), c=df$c))
# Reshape data
res <- reshape(ag, idvar="a", timevar="c", direction="wide")
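With this data the reshaped result should look roughly like this (the value column from aggregate is named x, so the wide columns become x.1 and x.2; the empty 4-6 bin does not appear):
      a x.1 x.2
  (0,3]   3   6
  (6,9]  NA   2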
There may be easier ways.
If your dataset is dat
res <- sapply(split(dat[, -3], dat$c), function(x) {
a_bin <- with(x, cut(a, breaks = c(1, 3, 6, 9), include.lowest = T, labels = c("1-3",
"4-6", "7-9")))
c(by(x$b, a_bin, FUN = max))
})
res1 <- setNames(data.frame(row.names(res), res),
c("a_bin", "b_m(c=1)", "b_m(c=2)"))
row.names(res1) <- 1:nrow(res1)
res1
a_bin b_m(c=1) b_m(c=2)
1 1-3 3 6
2 4-6 NA NA
3 7-9 NA 2
I would use a combination of data.table and reshape2, which are both optimized for speed (rather than looping with the apply family).
The output won't return the unused bins.
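Assuming temp holds the example data from the question:
temp <- data.frame(a = c(1, 2, 9, 1),
                   b = c(2, 3, 2, 6),
                   c = c(1, 1, 2, 2))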
v <- c(1, 4, 7, 10) # creating bins
temp$int <- findInterval(temp$a, v)
library(data.table)
temp <- setDT(temp)[, list(b_m = max(b)), by = c("c", "int")]
library(reshape2)
temp <- dcast.data.table(temp, int ~ c, value.var = "b_m")
## colnames(temp) <- c("a_bin", "b_m(c=1)", "b_m(c=2)") # Optional for prettier table
## temp$a_bin<- c("1-3", "7-9") # Optional for prettier table
## a_bin b_m(c=1) b_m(c=2)
## 1 1-3 3 6
## 2 7-9 NA 2
