Psych's reverse.code function producing NAs in R - r

When using reverse.code in R, the values in my ID column (which are not meant to be reversed) turn into NA once the ID value exceeds 999 (I have 10,110 observations).
Does anyone know if there is anything I can do to fix this?
Is there another function I can use to reverse these items without loosing data?
Here is my code:
library(psych)
keys <- c(1,-1,-1,-1) #Where column 1 = ID and the rest are my variables to be reversed
rev_dat2 <- reverse.code(keys, rev_dat)
Thanks!

Here is the relevant line of the source code of reverse.code(), where new is the object holding the reverse-coded data:
new[abs(new) > 999] <- NA
As you can see, setting values larger than 9999 to missing is hard-coded into the routine. You could write a new version of the function that didn't do that. For example, in the function below, we just make a much larger threshold:
my.reverse.code <- function (keys, items, mini = NULL, maxi = NULL)
{
if (is.vector(items)) {
nvar <- 1
}
else {
nvar <- dim(items)[2]
}
items <- as.matrix(items)
if (is.null(maxi)) {
colMax <- apply(items, 2, max, na.rm = TRUE)
}
else {
colMax <- maxi
}
if (is.null(mini)) {
colMin <- apply(items, 2, min, na.rm = TRUE)
}
else {
colMin <- mini
}
colAdj <- colMax + colMin
if (length(keys) < nvar) {
temp <- keys
if (is.character(temp))
temp <- match(temp, colnames(items))
keys <- rep(1, nvar)
keys[temp] <- -1
}
if (is.list(keys) | is.character(keys)) {
keys <- make.keys(items, keys)
keys <- diag(keys)
}
keys.d <- diag(keys, nvar, nvar)
items[is.na(items)] <- -99999999999
reversed <- items %*% keys.d
adj <- abs(keys * colAdj)
adj[keys > 0] <- 0
new <- t(adj + t(reversed))
new[abs(new) > 99999999999] <- NA
colnames(new) <- colnames(items)
colnames(new)[keys < 0] <- paste(colnames(new)[keys < 0],
"-", sep = "")
return(new)
}
The reason they used a numeric value threshold is that for the recoding they do to work, they needed all values to be numeric. So, they set missing values to -999 and then later turn them back into missing values. The same is done above, but with a lot bigger number.
keys <- c(1,-1,-1,-1) #Where column 1 = ID and the rest are my variables to be reversed
rev_dat <- data.frame(
id = 9998:10002,
x = 1:5,
y = 5:1,
z = 1:5
)
library(psych)
reverse.code(keys, rev_dat)
# id x- y- z-
# [1,] NA 5 1 5
# [2,] NA 4 2 4
# [3,] NA 3 3 3
# [4,] NA 2 4 2
# [5,] NA 1 5 1
my.reverse.code(keys, rev_dat)
# id x- y- z-
# [1,] 9998 5 1 5
# [2,] 9999 4 2 4
# [3,] 10000 3 3 3
# [4,] 10001 2 4 2
# [5,] 10002 1 5 1

Related

sum up results of a recursive function within the same recursive function

I want to build a data frame like
In the head I have a value of a number n
in factorial the factorial(n) which is a recursive function
in sum the sum of the previous values of the factiorials.
I write a recursive function that successfully generate the head and factorial columns but the still struggling with the sum column.
Thanks
Below R code
fact <- function(n, x){
if (n<=1){
return (n)
} else {
n*fact(n-1)
}
}
recurDf <- function(n, df){
if (n<=1){
df <- rbind (df, data.frame("value" = paste('Value', n) , "factorial" = n, "previous.sum" = 1) )
return (df)
} else {
if(is.null(df)) {
#df <- data.frame(matrix(ncol = 3, nrow = 0))
#colnames(df) <- c("value", "factorial", "previous.sum")
df <- data.frame("value"= 'va', "factorial" =0, "previous.sum" = 0)
}
rbind (recurDf(n-1,df), data.frame("value" = paste('Value', n) , "factorial" = fact(n), "previous.sum" = sum(recurDf(n-1,df)$factorial) ))
}
}
recurDf(4, NULL)
The following returns the factor of n in its first component and the cumulative sum of all factorials to n in its second argument.
fact2 <- function(n) {
if (n <= 1) c(1,1)
else {
prev <- Recall(n-1)
n * prev[1] + c(0, prev[2])
}
}
fact2(1)
## [1] 1 1
fact2(2)
## [1] 2 3
fact2(3)
## [1] 6 9
fact2(4)
## [1] 24 33
cbind(1:4, t(sapply(1:4, fact2)))
## [,1] [,2] [,3]
## [1,] 1 1 1
## [2,] 2 2 3
## [3,] 3 6 9
## [4,] 4 24 33
Is there a reason you need to do this recursively?
There are much simpler ways to get to your answer.
recurDf <- function(n){
df <- data.frame("value" = c(paste('Value',1:n)) , "factorial" = c(1:n))
df$factorial <- factorial(df$factorial)
df$previous.sum <- cumsum(df$factorial)
return (df)
}
recurDf(4)
This returns
value factorial previous.sum
1 Value 1 1 1
2 Value 2 2 3
3 Value 3 6 9
4 Value 4 24 33

R programming. Dataframe and subsets

Select only unique sets from a dataframe
here one set = one row of data frame.
syntax in r?
I want set concepts
see this example
1 1 2
1 2 1
1 2 3
o/p:
1 1 2
1 2 3
Here row1 and row2 form the sets ={1,2}, so I need only one copy of such rows.
This is my code for apriori algorithm. The function trim(data,r) is what i'hv been trying as a solution,but isn't working out.
uniqueItemSets<-function(data){
#unique items in basket
items <- c()
for(j in c(1:ncol(data))){
items <- c(items,unique(data[,j]))
}
items <- unique(items)
#return(as.list(items))
return(items)
}
F_itemset<-function(data,candidate,sup){
count <- rep(0,nrow(candidate))
for(i in c(1:nrow(data))){ #every transaction
for(j in c(1:nrow(candidate))){ #every dataset
x <- candidate[j,]
#x <- uniqueItemSets(x)
y <- data[i,]
#y <- uniqueItemSets(y)
if(all(x %in% y)){
count[j] <- count[j] + 1
}
}
}
#pruning
pp<-cbind(candidate,count)
pp<-as.data.frame(pp)
pp<-subset(pp,pp$count>=sup)
return(pp)
}
#k-itemset :k-value
makeItemSet<- function(candidate,k){
l<-combn(candidate,k,simplify=FALSE)
return(l)
}
aprio<-function(data,sup,conf,kmax){
C <- uniqueItemSets(data)
C <- as.data.frame(C)
for(k in c(2:kmax))
{
F <- F_itemset(data,C,sup)
F$count <- NULL
if(nrow(F)<k){
break;
}
F<-t(F)
C <- combn(F,k,simplify=FALSE)
C <- as.data.frame(C)
C <- t(C) #transpose
C<-unique(C)
trim(C,1)
}
return(F)
}
**
new <- data.frame()
trim<-function(data,r)
{
x<-as.data.frame(data[r,])
c<-c()
for(j in c(1:ncol(x))){
c<-c(c,x[,j])
}
c<-unique(c)
if(r+1<=nrow(data)){
for(i in c((r+1):nrow(data))){
t<-c()
for(j in c(1:ncol(data))){
t<-c(t,data[i,j])
}
t<-unique(t)
if(all(t %in% c) && all(c %in% t))
{
data[-i,]
}
}
new <- as.data.frame(data)
if(r+1 < nrow(data)){
trim(data[r+1:nrow(data),],r+1)
}
}
}
**
You can use apply with margin = 1 to execute row wise functions. The only thing to be aware of is that you need to transpose the outcome to get the order you need
d <- data.frame(number1 = c(1,1,1),
number2 = c(1,2,2),
number3 = c(2,1,3))
# next two statements can be run in one line of code if you want
d_sort <- t(apply(d, 1, sort))
# get rid of duplicate rows
unique(d_sort)
[,1] [,2] [,3]
[1,] 1 1 2
[2,] 1 2 3

Impute missing value for dummy variables in R by ratio of non_missing value

I am a new learner in R. Now, I made a trouble in imputing missing value and need your help. I have a data frame df like this:
a <- c(0,0,0,1,1,1,NA)
b <- c(1,0,1,0,1,0,NA)
c <- c(0,1,NA,0,1,0,1)
df <- data.frame(a,b,c)
I would like to impute the missing value of these variables according to the ratio of non NA value. For example: variable a has 50% of 0 and 50% of 1. So, NA value should be impute to 0 and 1 to keep the ratio the same.
Here are my code:
ratio0 <- function(x) { # ratio 0 of non NA missing value
table(x)[1]/sum(table(x)[1],table(x)[2])
}
ratio1 <- function(x) { # ratio 1 of non NA missing value
table(x)[2]/sum(table(x)[1],table(x)[2])
}
for(i in 1:ncol(df)) {
df[is.na(df[,i]), i] <- sample(c(0,1),sum(is.na(df[,i])),replace=TRUE,prob=c(ratio0(df[,i]),ratio1(df[,i])))
}
When applying the code above, I got the error: "Error in sample.int(length(x), size, replace, prob) : NA in probability vector".
Could you please let me know where my mistakes ?
Because when I try to apply code for a single variable, it works. For example, the code below to impute the missing value of the 3rd column of the dataframe df.
df[is.na(df[,3]), 3] <- sample(c(0,1), sum(is.na(df[,3])), replace=TRUE, prob=c(ratio0(df[,3]), ratio1(df[,3])))
Many thanks for your help.
If you want to make a ratio function I would do somthing like this
ratio <- function(x, which) {
b <- !is.na(x)
sum(x[b] == which) / sum(b)
}
but if I understood you correctly you could use the vector of not na values to sample from it directly
fun <- function(x) {
b <- is.na(x)
x[b] <- sample(x[!b], sum(b), replace=TRUE)
x
}
as.data.frame(lapply(df, fun), stringsAsFactors = FALSE)
We could construct a custom function and consequently apply() it to your data.frame columnwise.
# Function to replace NA's
replacer <- function(x) {
probs <- prop.table(table(x)) # Get proportions
y <- sample(c(0,1),length(which(is.na(x))), prob = probs, replace = TRUE)# Generate sample
x[is.na(x)] <- y # Replace values
return(x)
}
> apply(df,2,replacer)
# a b c
#[1,] 0 1 0
#[2,] 0 0 1
#[3,] 0 1 1
#[4,] 1 0 0
#[5,] 1 1 1
#[6,] 1 0 0
#[7,] 1 1 1

Assign an element value based on element adjacencies in R

I have a data frame with {0,1} indicating whether a product was Small, Medium or Large.
dat <- data.frame(Sm = c(1,0,0), Med = c(0,1,0), Lg = c(0,0,1))
Sm Med Lg
1 1 0 0
2 0 1 0
3 0 0 1
I'm looking to assign 1's to the 0's leading up to a 1 in a given row. For example in row 2 the product is a "Med", so I'm looking to assign a 1 to the 0 in the "Sm" column.
Allocation size is a consideration so I'm looking for a vectorized approach without using a for loop please. The final solution should output the following:
Sm Med Lg
1 1 0 0
2 1 1 0
3 1 1 1
I've tried several variations of the code below, but the closest I can get is a ragged array which assigns all of the 1's correctly while dropping the elements that have legitimate 0's.
apply(dat, 1, function(x) {
x[1:which.max(x)] <- 1
})
[1] 1 1 1
And below, which gets close but without the needed trailing 0's
apply(dat, 1, function(x) {
temp <- x[1:which.max(x)]
unlist(lapply(temp, function(y) {
y <- 1
}))
})
[[1]]
Sm
1
[[2]]
Sm Med
1 1
[[3]]
Sm Med Lg
1 1 1
First, convert to matrix and use max.col to get the index of the 1 in each row:
mat <- as.matrix(dat)
mc <- max.col(mat)
logical construction Overwrite the matrix:
mat = +(col(mat) <= mc)
or construct an index of matrix positions to change and change 'em:
logical indexing
mat[col(mat) < mc] <- 1L
# or
mat[which(col(mat) < mc)] <- 1L
matrix indexing
idx <- do.call( rbind, lapply( seq_along(mc), function(i)
if (i==1L) NULL
else cbind(i,seq_len(mc[i]-1))
))
mat[idx] <- 1L
vector indexing
nr <- nrow(mat)
idx <- unlist( lapply( seq_along(mc), function(i)
if (mc[i]==1L) NULL
else seq(from = i, by = nr, length.out = mc[i]-1L)
))
mat[idx] <- 1L
The help for all three indexing methods can be found at help("[<-").
This will do what you want.
dat[which(dat$Med==1),]$Sm = 1
dat[which(dat$Lg==1),]$Med = 1
dat[which(dat$Lg==1),]$Sm = 1

Identifying overlap zones in R raster package

Package:
raster
Data:
A rasterStack with 10 bands.
Each of the bands contains an image area surrounded by NAs
Bands are logical, i.e. "1" for image data and "0"/NA for surrounding area
The "image areas" of each band do not align completely with each other, though most have partial overlaps
Objective:
Write a fast function that can return either a rasterLayer or cell numbers for each "zone", for instance a pixel containing data only from bands 1 and 2 falls in zone 1, a pixel containing data only from bands 3 and 4 falls in zone 2, etc. If a rasterLayer is returned, I need to be able to match the zone value with band numbers later.
First attempt:
# Possible band combinations
values = integer(0)
for(i in 1:nlayers(myraster)){
combs = combn(1:nlayers(myraster), i)
for(j in 1:ncol(combs)){
values = c(values, list(combs[,j]))
}
}
# Define the zone finding function
find_zones = function(bands){
# The intersection of the bands of interest
a = subset(myraster, 1)
values(a) = TRUE
for(i in bands){
a = a & myraster[[i]]
}
# Union of the remaining bands
b = subset(myraster, 1)
values(b) = FALSE
for(i in seq(1:nlayers(myraster))[-bands]){
b = b | myraster[[i]]
}
#plot(a & !b)
cells = Which(a & !b, cells=TRUE)
return(cells)
}
# Applying the function
results = lapply(values, find_zones)
My current function takes a very long time to execute. Can you think of a better way? Note that I don't simply want to know how many bands have data at each pixel, I also need to know which bands. The purpose of this is to process different the areas differently afterwards.
Note also that the real-life scenario is a 3000 x 3000 or more raster with potentially more than 10 bands.
EDIT
Some sample data consisting of 10 offset image areas:
# Sample data
library(raster)
for(i in 1:10) {
start_line = i*10*1000
end_line = 1000000 - 800*1000 - start_line
offset = i * 10
data = c(rep(0,start_line), rep(c(rep(0,offset), rep(1,800), rep(0,200-offset)), 800), rep(0, end_line))
current_layer = raster(nrows=1000, ncols=1000)
values(current_layer) = data
if(i == 1) {
myraster = stack(current_layer)
} else {
myraster = addLayer(myraster, current_layer)
}
}
NAvalue(myraster) = 0 # You may not want to do this depending on your solution...
EDIT : Answer updated using Nick's trick and matrix multiplication.
You could try the following function, optimized by using Nick's trick and matrix multiplication. The bottleneck now is filling up stack with the seperate layers, but I guess the timings are quite OK now. Memory usage is a bit less, but given your data and the nature of R, I don't know if you can nibble of a bit without hampering the performance big time.
> system.time(T1 <- FindBands(myraster,return.stack=T))
user system elapsed
6.32 2.17 8.48
> system.time(T2 <- FindBands(myraster,return.stack=F))
user system elapsed
1.58 0.02 1.59
> system.time(results <- lapply(values, find_zones))
Timing stopped at: 182.27 35.13 217.71
The function returns either a rasterStack with the different level combinations present in the plot (that's not all possible level combinations, so you have some gain there already), or a matrix with the level number and level names. This allows you to do something like :
levelnames <- attr(T2,"levels")[T2]
to get the level names for each cell point. As shown below, you can easily put that matrix inside a rasterLayer object.
The function :
FindBands <- function(x,return.stack=F){
dims <- dim(x)
Values <- getValues(x)
nn <- colnames(Values)
vec <- 2^((1:dims[3])-1)
#Get all combinations and the names
id <- unlist(
lapply(1:10,function(x) combn(1:10,x,simplify=F))
,recursive=F)
nameid <- sapply(id,function(i){
x <- sum(vec[i])
names(x) <- paste(i,collapse="-")
x
})
# Nicks approach
layers <- Values %*% vec
# Find out which levels we need
LayerLevels <- unique(sort(layers))
LayerNames <- c("No Layer",names(nameid[nameid %in% LayerLevels]))
if(return.stack){
myStack <- lapply(LayerLevels,function(i){
r <- raster(nr=dims[1],nc=dims[2])
r[] <- as.numeric(layers == i)
r
} )
myStack <- stack(myStack)
layerNames(myStack) <- LayerNames
return(myStack)
} else {
LayerNumber <- match(layers,LayerLevels)
LayerNumber <- matrix(LayerNumber,ncol=dims[2],byrow=T)
attr(LayerNumber,"levels") <- LayerNames
return(LayerNumber)
}
}
Proof of concept, using the data of RobertH :
r <- raster(nr=10, nc=10)
r[]=0
r[c(20:60,90:93)] <- 1
s <- list(r)
r[]=0
r[c(40:70,93:98)] <- 1
s <- c(s, r)
r[]=0
r[50:95] <- 1
s <- (c(s, r))
aRaster <- stack(s)
> X <- FindBands(aRaster,return.stack=T)
> plot(X)
> X <- FindBands(aRaster,return.stack=F)
> X
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 1 1 1 1 1 1 1 1 1 1
[2,] 1 1 1 1 1 1 1 1 1 2
[3,] 2 2 2 2 2 2 2 2 2 2
[4,] 2 2 2 2 2 2 2 2 2 4
[5,] 4 4 4 4 4 4 4 4 4 8
[6,] 8 8 8 8 8 8 8 8 8 8
[7,] 7 7 7 7 7 7 7 7 7 7
[8,] 5 5 5 5 5 5 5 5 5 5
[9,] 5 5 5 5 5 5 5 5 5 6
[10,] 6 6 8 7 7 3 3 3 1 1
attr(,"levels")
[1] "No Layer" "1" "2" "3" "1-2" "1-3"
"2-3" "1-2-3"
> XX <- raster(ncol=10,nrow=10)
> XX[] <- X
> plot(XX)
I'm not familiar with raster, but from what I grasp from the above, you essentially have a 10*3000*3000 array, right?
If so, for each position in the raster (second and third indices, currow and curcol), you can calculate a unique identifier for its 'zone' by using binary: run i over the 'bands' (first index) and sum r[i,currow, curcol]*2^(i-1). Depending on the internal workings of raster, it should be possible to have a rather quick implementation of this.
This results in a new 'raster' of size 3000*3000 holding the unique identifiers of each position. Finding the unique values in there gives you back the zones that actually occur in your data, and reversing the binary logic should give you the bands that belong to a given zone.
Pardon me if my interpretation of raster is incorrect: then please ignore my musings. Either way not a complete solution.
How about this?
library(raster)
#setting up some data
r <- raster(nr=10, nc=10)
r[]=0
r[c(20:60,90:93)] <- 1
s <- list(r)
r[]=0
r[c(40:70,93:98)] <- 1
s <- c(s, r)
r[]=0
r[50:95] <- 1
s <- (c(s, r))
plot(stack(s))
# write a vectorized function that classifies the data
#
fun=function(x,y,z)cbind(x+y+z==0, x==1&y+z==0, y==1&x+z==0, z==1&x+y==0, x==0&y+z==2, y==0&x+z==2, z==0&x+y==2,x+y+z==3)
z <- overlay(s[[1]], s[[2]], s[[3]], fun=fun)
# equivalent to
#s <- stack(s)
#z <- overlay(s[[1]], s[[2]], s[[3]], fun=fun)
ln <- c("x+y+z==0", "x==1&y+z==0", "y==1&x+z==0", "z==1&x+y==0", "x==0&y+z==2", "y==0&x+z==2", "z==0&x+y==2", "x+y+z==3")
layerNames(z) <- ln
x11()
plot(z)
more generic:
s <- stack(s)
fun=function(x)as.numeric(paste(which(x==1), collapse=""))
x <- calc(s,fun)
this is not good when nlayers(s) has double digits ("1", "2" is the same as "12", and in those cases you could use the function below (fun2) instead:
fun2=function(x)as.numeric(paste(c(9, x), collapse=""))
x2 <- calc(s,fun2)
unique(x)
# [1] 1 2 3 12 13 23 123
unique(x2)
# [1] 9000 9001 9010 9011 9100 9101 9110 9111
for the toy example only:
plot(x)
text(x)
p=rasterToPolygons(x)
plot(p, add=T)
I've written code for #Nick Sabbe's suggestion, which I think is very concise and relatively fast. This assumes that the input rasterStack already has logical 1 or 0 data:
# Set the channels to 2^i instead of 1
bands = nlayers(myraster)
a = stack()
for (i in 1:bands) {
a = addLayer(a, myraster[[i]] * 2^i)
}
coded = sum(a)
#plot(coded)
values = unique(coded)[-1]
remove(a, myraster)
# Function to retrieve which coded value means which channels
which_bands = function(value) {
single = numeric()
for (i in bands:1) {
if ((0 < value) & (value >= 2^i)) {
value = value - 2^i
single = c(single, i)
}
}
return(single)
}

Resources