change numeric vector - r

I have a numeric vector (see below). I would like to change all numbers that are assigned to high_ to 1 and all low_ to 2.
c(high_X17 = 3, high_X18 = 4, high_X19 = 5, high_X20 = 3, high_X21 = 1,
high_X22 = 1, high_X23 = 2, high_X24 = 2, low_X25 = 6, low_X26 = 4,
low_X27 = 6, low_X28 = 5, low_X29 = 2, low_X30 = 1, low_X31 = 1,
low_X32 = 2)
result
high_X17 high_X18 high_X19 high_X20 high_X21 high_X22 high_X23 high_X24 low_X25 low_X26
1 1 1 1 1 1 1 1 2 2
low_X29 low_X30 low_X31 low_X32
2 2 2 2

Try the code below
x <- startsWith(names(x),"low_") + 1

You can use -
x[] <- as.integer(sub('_.*', '', names(x)) == 'low') + 1
x
#high_X17 high_X18 high_X19 high_X20 high_X21 high_X22 high_X23 high_X24
# 1 1 1 1 1 1 1 1
# low_X25 low_X26 low_X27 low_X28 low_X29 low_X30 low_X31 low_X32
# 2 2 2 2 2 2 2 2
sub('_.*', '', names(x)) removes everything after underscore keeping only 'high' and 'low' values.

Using grepl
grepl("low_", names(x)) + 1

Related

Looping through items in dataframe while using apply function

I have a dataframe that has 104 items ("items1" --> "items104"). Each item was administered at different ages, so "items1" for instance is divided into columns items1.12, items1.18, items1.24, items1.30, items1.36. This is the case for all 104 items. I would like to run the following code on each of the 104 items in the dataframe.
ID <- c("4000", "4001", "4006", "4007", "4009", "4010")
items1.12 <- c(1, 1, 1, 1, 1, 1)
items1.18 <- c(1, 1, 1, 1, 1, 1)
items1.24 <- c(1, 1, 1, 1, 1, 1)
items1.30 <- c(1, 1, 1, 1, 1, 1)
items1.36 <- c(1, 1, 1, 1, 1, 1)
items2.12 <- c(2, 2, 1, 1, 2, 1)
items2.18 <- c(2, 2, 1, 1, 2, 1)
items2.24 <- c(2, 2, 1, 1, 2, 1)
items2.30 <- c(2, 2, 1, 1, 2, 1)
items2.36 <- c(2, 2, 1, 1, 2, 1)
wide <- data.frame(ID, items1.12, items1.18, items1.24, items1.30, items1.36, items2.12, items2.18, items2.24, items2.30, items2.36)
ID items1.12 items1.18 items1.24 items1.30 items1.36 items2.12 items2.18 items2.24 items2.30 items2.36
4000 1 1 1 1 1 2 2 2 2 2
4001 1 1 1 1 1 2 2 2 2 2
4006 1 1 1 NA 1 1 1 1 1 1
4007 1 1 1 1 1 1 1 1 1 1
4009 1 1 1 1 1 2 2 2 2 2
4010 1 1 1 1 1 1 1 1 1 1
I would like to run this code for each item. Here is an example for "items1"
wide$items1.new <- apply(!is.na(wide[,paste("items1.", c(12,18,24,30,36), sep = "")]), 1, max)
wide$items1.new.2 <- NA
for(i in unique(wide$ID)){
select <- i == wide$ID
ifelse(wide$items1.new[select] == 0, wide$items1.new.2 [select] <- NA, wide$items1.new.2[select] <- rowMeans(wide[select,c("items1.12", "items1.18", "items1.24", "items1.30", "items1.36")], na.rm = T))}
wide <- subset(wide, select = -c(items1.new, items1.12, items1.18, items1.24, items1.30, items1.36))
names(wide)[names(wide) == 'items1.new.2'] <- "item1"
Here is an example for "items2"
wide$items2.new <- apply(!is.na(wide[,paste("items2.", c(12,18,24,30,36), sep = "")]), 1, max)
wide$items2.new.2 <- NA
for(i in unique(wide$ID)){
select <- i == wide$ID
ifelse(wide$items2.new[select] == 0, wide$items2.new.2 [select] <- NA, wide$items2.new.2[select] <- rowMeans(wide[select,c("items2.12", "items2.18", "items2.24", "items2.30", "items2.36")], na.rm = T))}
wide <- subset(wide, select = -c(items2.new, items2.12, items2.18, items2.24, items2.30, items2.36))
names(wide)[names(wide) == 'items2.new.2'] <- "item2"
Here is what I would like to end with:
ID item1 item2
4000 1 2
4001 1 2
4006 1 1
4007 1 1
4009 1 2
4010 1 1
I would like to do this for items1 to items104 in my dataset. I can't imagine the solution would be very complicated, but I would really appreciate some help as I'm new to R. Thank you so much.
In base R, this can be done with split.default
cbind(wide['ID'], sapply(split.default(wide[-1],
sub("\\.\\d+$", "" , names(wide)[-1])), rowMeans, na.rm = TRUE))
-output
ID items1 items2
1 4000 1 2
2 4001 1 2
3 4006 1 1
4 4007 1 1
5 4009 1 2
6 4010 1 1

ifelse over each element of a vector

Looking at this post, I thought ifelse is vectorized in the sense that f(c(x1, x2, x3)) = c(f(x1), f(x2), f(x3)).
So, I thought if the code for z1 (provided below) will perform the following for each element of the vector y:
Test whether it is unity or not.
If YES, draw a random number from {1, 3, 5, 7, 9}.
If NO, draw a random number from {0, 2, 4, 6, 8}.
But, unfortunately it doesn't do that. It generates once for each case, and returns that very random number always.
Where exactly am I doing wrong? Or, is it actually the expected behaviour of ifelse?
Just to note, if I use this as a wrapper function inside sapply, I get the expected output z2 (in the sense that it is not deterministic as z1 where observing one occurrence of each case is enough), as you can see below.
y <- rbinom(n = 20,
size = 1,
prob = 0.5)
z1 <- ifelse(test = (y == 1),
yes = sample(x = c(1, 3, 5, 7, 9),
size = 1),
no = sample(x = c(0, 2, 4, 6, 8),
size = 1))
z2 <- sapply(X = y,
FUN = function(w)
{
ifelse(test = (w == 1),
yes = sample(x = c(1, 3, 5, 7, 9),
size = 1),
no = sample(x = c(0, 2, 4, 6, 8),
size = 1))
})
data.frame(y, z1, z2)
#> y z1 z2
#> 1 0 2 2
#> 2 1 1 3
#> 3 1 1 9
#> 4 1 1 7
#> 5 0 2 0
#> 6 0 2 2
#> 7 1 1 7
#> 8 1 1 7
#> 9 0 2 0
#> 10 1 1 5
#> 11 0 2 0
#> 12 0 2 0
#> 13 0 2 6
#> 14 0 2 0
#> 15 0 2 2
#> 16 1 1 7
#> 17 1 1 7
#> 18 0 2 2
#> 19 0 2 2
#> 20 0 2 0
unique(x = z1[y == 1])
#> [1] 1
unique(x = z1[y == 0])
#> [1] 2
Created on 2019-03-13 by the reprex package (v0.2.1)
Any help will be appreciated.
ifelse isn't a function of one vector, it is a function of 3 vectors of the same length. The first vector, called test, is a boolean, the second vector yes and third vector no give the elements in the result, chosen item-by-item based on the test value.
A sample of size = 1 is a different size than test (unless the length of test is 1), so it will be recycled by ifelse (see note below). Instead, draw samples of the same size as test from the start:
ifelse(
test = (y == 1),
yes = sample(x = c(1, 3, 5, 7, 9), size = length(y), replace = TRUE),
no = sample(x = c(0, 2, 4, 6, 8), size = lenght(y), replace = TRUE)
)
The vectors don't actually have to be of the same length. The help page ?ifelse explains: "If yes or no are too short, their elements are recycled." This is the behavior you observed with "It generates once for each case, and returns that very random number always.".

R: creating sequence of numbers by group and starting the sequence by a particular condition

I would like to create a new variable, Number, which sequentially generate numbers within a group ID, starting at a particular condition (in this case, when Percent > 5).
groupID <- c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3)
Percent <- c( 3, 4, 5, 10, 2, 1, 6, 8, 4, 8, 10, 11)
Number <- ifelse (Percent < 5, 0, 1:4)
I get:
> Number
[1] 0 0 3 4 0 0 3 4 0 2 3 4
But I'd like:
0 0 1 2 0 0 1 2 0 1 2 3
I did not include groupID variable within the ifelse statement and used 1:4 instead, as there are always 4 rows within each groupID.
Any suggestions or clues? Thank you!
ave(Percent, groupID, FUN=function(x) cumsum(x>=5))
[1] 0 0 1 2 0 0 1 2 0 1 2 3
To the example in the comments below, this is my alternate logical test to be cumsum()-ed:
ave(Percent, groupID, FUN=function(x) cumsum(seq_along(x)>= which(x >=5)[1]) )
It's ugly and throws warnings, but it gets you what you want:
ave(Percent,groupID,FUN=function(x) {x[x<5] <- 0; x[x>=5] <- 1:4; x} )
#[1] 0 0 1 2 0 0 1 2 0 1 2 3
#BondedDust's answer below using cumsum is almost certainly more appropriate though.
If your data was not always in ascending order in each group, you could also replace all the >=5 values like:
Percent <- c( 3, 5, 4, 10, 2, 1, 6, 8, 4, 8, 10, 11)
ave(Percent, list(groupID,Percent>=5), FUN=function(x) cumsum(x>=5))
#[1] 0 1 0 2 0 0 1 2 0 1 2 3
Try this:
ID <- c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3)
Percent <- c( 3, 4, 5, 10, 2, 1, 6, 8, 4, 8, 10, 11)
Number <- Percent >= 5
result = lapply(seq_along(Number), function(i){
if( length(which(! Number[1:i]) ) == 0){start = 1}
else {start =max(which(! Number[1:i]) )}
sum( Number[start : i])
})
> unlist(result)
[1] 0 0 1 2 0 0 1 2 0 1 2 3

Reverse score a vector

I have numeric vectors, such as c(1, 2, 3, 3, 2, 1, 3) or c(1, 4, 1, 4, 4, 1), and I would like to keep individual element's position, but swap/reverse the value, so that we get c(3, 2, 1, 1, 2, 3, 1), c(4, 1, 4, 1, 1, 4) respectively.
To achieve that, I came up with a rather rough and ugly code below with lots of debugging and patching...
blah <- c(1, 4, 1, 4, 4, 1, 3)
blah.uniq <- sort(unique(blah))
blah.uniq.len <- length(blah.uniq)
j <- 1
end <- ceiling(blah.uniq.len / 2)
if(end == 1) {end <- 2} # special case like c(1,4,1), should get c(4,1,4)
for(i in blah.uniq.len:end) {
x <- blah == blah.uniq[i]
y <- blah == blah.uniq[j]
blah[x] <- blah.uniq[j]
blah[y] <- blah.uniq[i]
j = j + 1
}
blah
Is there an easier way to do this?
I think you're trying to reverse score. The algorithm is (1 + max(x_i)) - x_i
so...
x <- c(1, 2, 3, 3, 2, 1, 3)
y <- c(1, 4, 1, 4, 4, 1)
(max(x, na.rm=T) + 1) - x
(max(y, na.rm=T) + 1) - y
yielding:
> (max(x, na.rm=T) + 1) - x
[1] 3 2 1 1 2 3 1
> (max(y, na.rm=T) + 1) - y
[1] 4 1 4 1 1 4
Per the OP's comment:
rev.score <- function(x) {
h <- unique(x)
a <- seq(min(h, na.rm=T), max(h, na.rm=T))
b <- rev(a)
dat <- data.frame(a, b)
dat[match(x, dat[, 'a']), 2]
}
x <- c(1, 2, 3, 3, 2, 1, 3)
rev.score(x)
y <- c(1, 4, 1, 4, 4, 1)
rev.score(y)
z <- c(1, 5, 10, -3, -5, 2)
rev.score(z)
Congratulations! You might have finally found a use for factors , I was still looking for one :-)
x <- c(1, 2, 3, 3, 2, 1, 3)
# [1] 1 2 3 3 2 1 3
y <- factor(x)
# [1] 1 2 3 3 2 1 3
# Levels: 1 2 3
levels(y) <- rev(levels(y))
# [1] 3 2 1 1 2 3 1
# Levels: 3 2 1
Built on that idea, here is a function that returns an object with the same class as the input:
swap <- function(x) {
f <- factor(x)
y <- rev(levels(f))[f]
class(y) <- class(x)
return(y)
}
swap(c(1, 2, 3, 3, 2, 1, 3))
# [1] 3 2 1 1 2 3 1
swap(c(1, 4, 1, 4, 4, 1))
# [1] 4 1 4 1 1 4
A possible generalisable function.
revscore <- function(x) {
rx <- min(x):max(x)
rev(rx)[sapply(1:length(x), function(y) match(x[y],rx))]
}
x1 <- c(-3,-1,0,-2,3,2,1)
x2 <- c(-1,0,1,2)
x3 <- 1:7
Some testing:
> x1
[1] -3 -1 0 -2 3 2 1
> revscore(x1)
[1] 3 1 0 2 -3 -2 -1
> x2
[1] -1 0 1 2
> revscore(x2)
[1] 2 1 0 -1
> x3
[1] 1 2 3 4 5 6 7
> revscore(x3)
[1] 7 6 5 4 3 2 1

Sort list of vectors into single frequency table with a factor column

I have a data frame containing a list vector with jagged entries:
df = data.frame(x = rep(c(1,2), 2), y = rep(c("a", "b"), each = 2))
L = list()
for (each in round(runif(4, 1,5))) L = c(L, list(1:each))
df$L = L
For example,
x y L
1 a 1
2 a 1, 2, 3, 4
1 b 1, 2, 3
2 b 1, 2, 3
How could I create a table which counts the values of L for each x, across the values of y? So, in this example it would output something like,
1 2 3 4
X
1 2 1 1 0
2 2 2 2 1
I had some luck using
tablist = function(L) table(unlist(L))
tapply(df$L, df$x, tablist)
which produces,
$`1`
1 2 3
2 1 1
$`2`
1 2 3 4
2 2 2 1
However, I'm not sure how to go from here to a single table. Also, I'm beggining to suspect that this approach might start taking an unruly amount of time for large data frames. Any thoughts / suggestions would be greatly appreciated!
Using pylr
library(plyr)
df = data.frame(x = rep(c(1,2), 2), y = rep(c("a", "b"), each = 2))
L = list()
set.seed(2)
for (each in round(runif(4, 1,5))) L = c(L, list(1:each))
df$L = L
> df
x y L
1 1 a 1, 2
2 2 a 1, 2, 3, 4
3 1 b 1, 2, 3
4 2 b 1, 2
table(ddply(df,.(x),summarize,unlist(L)))
> table(ddply(df,.(x),summarize,unlist(L)))
..1
x 1 2 3 4
1 2 2 1 0
2 2 2 1 1
If you're not into plyr...
vals <- unique(unlist(df$L))
names(vals) <- vals
do.call("rbind",
lapply(split(df,df$x),function(byx){
sapply(vals, function(i){
sum(unlist(sapply(byx$L,"==",i)))
})
})
)

Resources