Creating a new variable with if statement in R - r

I have the following situation:
In my dataset, there are two variables (x1 and x2).
# Example data: x1 holds letter codes, x2 holds numbers with missing entries.
mydata <- data.frame(
  x1 = c(
    "a", "b", "c", "n", "a", "d", "b", "l", "a",
    "c", "t", "a", "b", "d", "c", "l", "n", "b"
  ),
  x2 = c(1, NA, 5, 2, NA, 5, 4, NA, 2, NA, 2, 6, NA, 6, NA, 2, 6, NA)
)
# Only x2 has missing entries; writing a blank string into it turns the
# whole column into text.
mydata$x2[is.na(mydata$x2)] <- " "
x1 x2
1 a 1
2 b
3 c 5
4 n 2
5 a
6 d 5
7 b 4
8 l
9 a 2
10 c
11 t 2
12 a 6
13 b
14 d 6
15 c
16 l 2
17 n 6
18 b
I want to create a third variable, let’s call it x3, which is defined as
If (x1 is "a", "b" or "c") AND (x2 is blank, i.e. " "), then x3 = 1; otherwise x3 = 0.
The output is
x1 x2 x3
1 a 1 0
2 b 1
3 c 5 0
4 n 2 0
5 a 1
6 d 5 0
7 b 4 0
8 l 0
9 a 2 0
10 c 1
11 t 2 0
12 a 6 0
13 b 1
14 d 6 0
15 c 1
16 l 2 0
17 n 6 0
18 b 1

You can try
# x3 is 1 where x2 is the blank placeholder and x1 is one of a/b/c, else 0.
mydata$x3 <- as.numeric(
  mydata$x2 == " " & mydata$x1 %in% c("a", "b", "c")
)
mydata$x3
#[1] 0 1 0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 1
Or, as commented by @David Arenburg, other options to create a column include functions such as transform, within, etc.
# Same rule via transform(): integer 1 where x2 is blank and x1 is a/b/c.
transform(
  mydata,
  x3 = as.integer(x2 == " " & x1 %in% c("a", "b", "c"))
)

Related

How to mark rows in groups between row numbers based on value?

here is a table example:
# Example table: groups A, B and C with 10, 8 and 10 rows respectively.
dt <- data.frame(
  cat = rep(c("A", "B", "C"), times = c(10, 8, 10)),
  value = c(
    0, 0, 0, 3, 5, 0, 8, 0, 0, 0, # A
    0, 0, 2, 2, 3, 0, 1, 0,       # B
    0, 0, 0, 0, 1, 2, 3, 4, 0, 0  # C
  )
)
dt
cat value
1 A 0
2 A 0
3 A 0
4 A 3
5 A 5
6 A 0
7 A 8
8 A 0
9 A 0
10 A 0
11 B 0
12 B 0
13 B 2
14 B 2
15 B 3
16 B 0
17 B 1
18 B 0
19 C 0
20 C 0
21 C 0
22 C 0
23 C 1
24 C 2
25 C 3
26 C 4
27 C 0
28 C 0
What I would like to do is flag the rows within each group: 0 for the rows before the first non-zero value, 1 for the rows from the first to the last non-zero value, and 2 for the rows after the last non-zero value. So, the final table should look like:
cat value flag
1 A 0 0
2 A 0 0
3 A 0 0
4 A 3 1
5 A 5 1
6 A 0 1
7 A 8 1
8 A 0 2
9 A 0 2
10 A 0 2
11 B 0 0
12 B 0 0
13 B 2 1
14 B 2 1
15 B 3 1
16 B 0 1
17 B 1 1
18 B 0 2
19 C 0 0
20 C 0 0
21 C 0 0
22 C 0 0
23 C 1 1
24 C 2 1
25 C 3 1
26 C 4 1
27 C 0 2
28 C 0 2
Thanks a lot in advance,
Does this work:
library(dplyr)
# The running total per group locates each row relative to the non-zero
# values (assuming value is never negative): still 0 means before the first
# one; a zero value at the group's maximum total means after the last one;
# everything else lies in between.
dt %>%
  group_by(cat) %>%
  mutate(
    c1 = cumsum(value),
    flat = case_when(
      c1 == 0 ~ 0,
      value == 0 & c1 == max(c1) ~ 2,
      TRUE ~ 1
    )
  ) %>%
  select(cat, value, flat) %>%
  print(n = 50)
# A tibble: 28 x 3
# Groups: cat [3]
cat value flat
<chr> <dbl> <dbl>
1 A 0 0
2 A 0 0
3 A 0 0
4 A 3 1
5 A 5 1
6 A 0 1
7 A 8 1
8 A 0 2
9 A 0 2
10 A 0 2
11 B 0 0
12 B 0 0
13 B 2 1
14 B 2 1
15 B 3 1
16 B 0 1
17 B 1 1
18 B 0 2
19 C 0 0
20 C 0 0
21 C 0 0
22 C 0 0
23 C 1 1
24 C 2 1
25 C 3 1
26 C 4 1
27 C 0 2
28 C 0 2
Write a function which assigns the value 0, 1 or 2 to each row based on its position relative to the first and last non-zero values.
library(dplyr)
# Flag every position of x relative to its first and last non-zero entries.
#
# x: numeric vector (one group's values, in row order).
# Returns a numeric vector of the same length as x:
#   0 before the first non-zero entry,
#   1 from the first through the last non-zero entry,
#   2 after the last non-zero entry.
# A vector without any non-zero entry is flagged 0 throughout.
assign_flag <- function(x) {
  # Positions of the non-zero entries; which() ignores NA comparisons.
  # Testing != 0 (rather than > 0) also catches negative values.
  nonzero <- which(x != 0)
  if (length(nonzero) == 0L) {
    # Nothing to bracket: every row counts as "before the first value".
    return(rep(0, length(x)))
  }
  first <- nonzero[1L]
  last <- nonzero[length(nonzero)]
  # Positions are taken from x itself, so this works both inside and
  # outside a grouped mutate().
  pos <- seq_along(x)
  ifelse(pos < first, 0, ifelse(pos <= last, 1, 2))
}
and apply it for each group.
# Compute the flag separately within each cat group, then drop the grouping.
dt %>%
  group_by(cat) %>%
  mutate(flag = assign_flag(x = value)) %>%
  ungroup()
# cat value flag
#1 A 0 0
#2 A 0 0
#3 A 0 0
#4 A 3 1
#5 A 5 1
#6 A 0 1
#7 A 8 1
#8 A 0 2
#9 A 0 2
#10 A 0 2
#11 B 0 0
#12 B 0 0
#13 B 2 1
#14 B 2 1
#15 B 3 1
#16 B 0 1
#17 B 1 1
#18 B 0 2
#19 C 0 0
#20 C 0 0
#21 C 0 0
#22 C 0 0
#23 C 1 1
#24 C 2 1
#25 C 3 1
#26 C 4 1
#27 C 0 2
#28 C 0 2
A base R solution
# Flag rows per cat group: 0 before the first non-zero value, 1 from the
# first through the last non-zero value, 2 after the last one.
dt_split <- lapply(split(dt, f = dt$cat), function(x) {
  # Positions of the non-zero values within this group
  nonzero <- which(x$value != 0)
  if (length(nonzero) == 0L) {
    # No non-zero value in the group: the rep() counts below would be NA and
    # fail, so flag every row 0 instead.
    x$flag <- rep(0, nrow(x))
    return(x)
  }
  first <- nonzero[1L]
  last <- nonzero[length(nonzero)]
  x$flag <- c(
    rep(0, first - 1L),        # the leading zeros
    rep(1, last - first + 1L), # first through last non-zero value
    rep(2, nrow(x) - last)     # the trailing zeros
  )
  x
})
# Stack the per-group pieces back into a single data frame
dt <- do.call(rbind, dt_split)

Count consecutive numbers

I have some time series with corresponding number for each date as 0 or 1. For example:
date value
1 0
2 0
3 1
4 1
5 1
6 0
7 1
8 1
So I want to count the consecutive 1's: for dates 3-5 the count should be 3, and the counting should then start again at date 7. If the count of a run is below 6, the 1's of that run should be changed to 0's.
library(dplyr)
# For each row, count = length of the run it belongs to when that run is
# made of 1s (0 for runs of 0s); count_1 keeps only counts of at least 6.
data.frame(date = 1:8, value = c(0, 0, 1, 1, 1, 0, 1, 1)) %>%
  mutate(
    # Run length times run value, repeated once per row of the run
    count = with(rle(value), rep(x = lengths * values, times = lengths)),
    count_1 = ifelse(count < 6, 0, count)
  )
gives:
date value count count_1
1 1 0 0 0
2 2 0 0 0
3 3 1 3 0
4 4 1 3 0
5 5 1 3 0
6 6 0 0 0
7 7 1 2 0
8 8 1 2 0
I would first create a grouping variable and then use this to aggregate the dataset.
# Example series of 0/1 values, one per date.
d <- data.frame(
  date = 1:12,
  value = c(1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0)
)
# Run id: starts at 1 and goes up by 1 each time value differs from the row
# before it. This is vectorised, and unlike a loop over 2:nrow(d) it does not
# count backwards when d has a single row.
d$group <- cumsum(c(1, diff(d$value) != 0))
# One row per run: its first date, its last date, and the number of 1s in it.
nd <- data.frame(
  Group = unique(d$group),
  Start = aggregate(date ~ group, data = d, FUN = min)[, 2],
  End = aggregate(date ~ group, data = d, FUN = max)[, 2],
  Count = aggregate(value ~ group, data = d, FUN = sum)[, 2]
)
The output for this data would be:
> d ## Input data
date value
1 1 1
2 2 1
3 3 0
4 4 0
5 5 1
6 6 1
7 7 1
8 8 1
9 9 0
10 10 0
11 11 1
12 12 0
> nd ## All groups
Group Start End Count
1 1 1 2 2
2 2 3 4 0
3 3 5 8 4
4 4 9 10 0
5 5 11 11 1
6 6 12 12 0
> nd[nd$Count>0,] ## Just the groups with 1 in them:
Group Start End Count
1 1 1 2 2
3 3 5 8 4
5 5 11 11 1
Another solution which looks like what you expected :
d <- data.frame(
  date = 1:20,
  value = c(1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0)
)
# Run-length encode the 0/1 series.
repl <- rle(d$value)
# Per row, the length of the run it belongs to, but only for runs of 1s:
# multiplying by the run's value zeroes out the runs of 0s, so a stretch of
# six or more 0s is never reported as a count.
rep_lengths <- rep(repl$lengths * repl$values, repl$lengths)
# Runs shorter than 6 are dropped to 0.
rep_lengths[rep_lengths < 6] <- 0
d$value <- rep_lengths
returns
> d
date value
1 1 0
2 2 0
3 3 0
4 4 0
5 5 0
6 6 0
7 7 0
8 8 0
9 9 0
10 10 0
11 11 0
12 12 0
13 13 7
14 14 7
15 15 7
16 16 7
17 17 7
18 18 7
19 19 7
20 20 0
You can use rle to count the consecutive values and use ifelse to set the counts lower than 6 to 0:
# Run-length encode the series, replace each run's value by
# (run length * run value) so runs of 1s carry their length, zero out
# anything below 6, then expand back to one entry per row.
y <- rle(x$value)
y$values <- y$lengths * y$values
y$values[y$values < 6] <- 0
inverse.rle(y)
#[1] 0 0 0 0 0 0 0 0
Data:
x <- data.frame(date = 1:8, value = c(0,0,1,1,1,0,1,1))

add column with total count of rows meeting a condition in dplyr

Trying to get totals by class and condition but not grouping data.
Reproducible example:
# Example data: a class label and an increment per row.
df <- data.frame(
  class = c("a", "b", "c", "d", "b", "b", "b", "b", "c", "c", "a"),
  increment = c(0, 0, 0, 0, 0, 0, 32, 12, 0, 0, 0)
)
R> df
class increment
1 a 0
2 b 0
3 c 0
4 d 0
5 b 0
6 b 0
7 b 32
8 b 12
9 c 0
10 c 0
11 a 0
I want, for every class, the total number of rows where increment is different from zero.
Desired output:
R> df
class increment increment_count_per_class
1 a 0 0
2 b 0 2
3 c 0 0
4 d 0 0
5 b 0 2
6 b 0 2
7 b 32 2
8 b 12 2
9 c 0 0
10 c 0 0
11 a 0 0
My first approach is here below, but I know there must be a less convoluted way using dplyr:
# Mark rows with a positive increment: 1 if increment > 0, otherwise 0.
df <- df %>%
  mutate(has.increment = as.numeric(increment > 0))
R> df
class increment has.increment
1 a 0 0
2 b 0 0
3 c 0 0
4 d 0 0
5 b 0 0
6 b 0 0
7 b 32 1
8 b 12 1
9 c 0 0
10 c 0 0
11 a 0 0
Get totals per class when increment exists
# Count rows per (class, has.increment) pair and keep only the pairs flagged 1
N <- df %>% group_by(class,has.increment) %>% tally() %>% filter(has.increment == 1)
R> N
# A tibble: 1 x 3
# Groups: class [1]
class has.increment n
<chr> <dbl> <int>
1 b 1 2
Then join:
merge(N,df, by = "class", all = TRUE)
R> merge(N,df, by = "class", all = TRUE)
class has.increment.x n increment has.increment.y
1 a NA NA 0 0
2 a NA NA 0 0
3 b 1 2 0 0
4 b 1 2 12 1
5 b 1 2 0 0
6 b 1 2 0 0
7 b 1 2 32 1
8 c NA NA 0 0
9 c NA NA 0 0
10 c NA NA 0 0
11 d NA NA 0 0
Try this:
# For every row, add the number of non-zero increments found in its class.
# The grouping is only needed for the count, so it is dropped afterwards and
# the result is a plain, ungrouped table.
df %>%
  group_by(class) %>%
  mutate(increment_count_per_class = sum(increment != 0)) %>%
  ungroup()

How can I count occurrences of different integer values (in a particular column) for rows with different factor values (another column)?

I have a dataframe which looks something like this:
# 30 rows: the names A-J repeated three times, each with a random number
# drawn from 0..3 (the closing parenthesis of data.frame() was missing).
My_Data <- data.frame(
  name = rep(LETTERS[1:10], 3),
  number = sample(0:3, 30, replace = TRUE)
)
name number
1 A 3
2 B 3
3 C 0
4 D 3
5 E 2
6 F 2
7 G 2
8 H 2
9 I 1
10 J 3
11 A 1
12 B 2
13 C 0
14 D 1
15 E 3
16 F 0
17 G 2
18 H 2
19 I 2
20 J 2
21 A 0
22 B 1
23 C 3
24 D 0
25 E 2
26 F 0
27 G 1
28 H 1
29 I 3
30 J 0
Now I would like to get a data frame with one column for each possible value of the number column, holding the count of occurrences of that value for each value in the name column:
name number_0 number_1 number_2 number_3
1 A 1 1 0 1
2 B 0 1 1 1
3 C 2 0 0 1
4 D 1 1 0 1
5 E 0 0 2 1
6 F 2 0 1 0
7 G 0 1 2 0
8 H 0 1 2 0
9 I 0 1 1 1
10 J 1 0 1 1
How can I do that?
Thanks!
Edit: I am not looking for a conversion to the wide format. I am looking for a way to count occurrences of each of the possible values.
You can also use xtabs() function.
xtabs(~My_Data$name + My_Data$number)
We could get the count and then spread to 'wide' format
library(dplyr)
library(tidyr)
# Count each (name, number) pair, relabel the numbers as number_<k>, then
# give every label its own column, filling absent combinations with 0.
counts <- count(My_Data, name, number)
counts %>%
  mutate(number = paste("number", number, sep = "_")) %>%
  spread(key = number, value = n, fill = 0)
# A tibble: 10 x 5
# name number_0 number_1 number_2 number_3
# * <chr> <dbl> <dbl> <dbl> <dbl>
# 1 A 1 1 0 1
# 2 B 0 1 1 1
# 3 C 2 0 0 1
# 4 D 1 1 0 1
# 5 E 0 0 2 1
# 6 F 2 0 1 0
# 7 G 0 1 2 0
# 8 H 0 1 2 0
# 9 I 0 1 1 1
#10 J 1 0 1 1
Try also:
table(My_Data)
or, if you need a data.frame:
as.data.frame.matrix(table(My_Data))

Generate matrix in RevoScaleR from a data frame

I have a data frame as below:
group sex age
A M 15
A F 17
A M 12
A F 2
A F 6
A M 3
A M 10
A M 18
B F 16
B M 6
B M 18
B M 15
B F 8
B F 17
B M 18
B M 16
B F 13
B F 5
B F 13
B F 4
B M 15
B M 8
B M 18
C F 7
C M 12
C M 3
C F 1
C F 9
C F 2
expected result for this data frame.
A B C
A 0 4 3
B 4 0 0
C 3 0 0
I would like to generate a matrix showing the similarity between the groups in the input data, based on "age". For example, if group A and group B have 2 ages in common, then the cell for A and B will be 2.
One solution with outer:
library(magrittr)
# Number of distinct ages that groups u and v have in common, read from the
# data frame df (columns group and age) in the calling environment.
# A group compared with itself yields 0. Vectorize() lets outer() pass whole
# vectors of group labels.
func <- Vectorize(function(u, v) {
  if (all(u == v)) {
    return(0)
  }
  ages_u <- subset(df, group == u)$age
  ages_v <- subset(df, group == v)$age
  # intersect() already discards duplicates, so no extra unique() is needed
  length(intersect(ages_u, ages_v))
})
# Evaluate func for every pair of distinct group labels and label both
# dimensions of the resulting matrix with those labels.
x <- unique(df$group)
m <- outer(x, x, func)
colnames(m) <- x
row.names(m) <- x
#>m
# A B C
#A 0 4 3
#B 4 0 0
#C 3 0 0
We could merge the dataset ("df") to itself by "age" on a subset of dataset ("df[-2]", ie. the second column is removed), remove the rows that are the same for "group.x" and "group.y", and reshape the unique dataset ("df1") from "long" to "wide" using acast.
library(reshape2)
# Join df (without its second column) to itself on age, and keep only the
# pairs of rows that come from two different groups.
df1 <- subset(
  merge(df[-2], df[-2], by = "age"),
  group.x != group.y
)
# Reshape the distinct (age, group.x, group.y) rows into a
# group.x by group.y table.
acast(unique(df1), group.x ~ group.y, value.var = "age")
# A B C
#A 0 4 3
#B 4 0 0
#C 3 0 0
Or use xtabs from base R
xtabs(~group.x+group.y, unique(df1))
# group.y
#group.x A B C
# A 0 4 3
# B 4 0 0
# C 3 0 0
Update
Regarding the new dataset/expected result, it is not clear which column should be included in the relationship with "re". Here, I used "pro_id" to get the expected result.
# Cross-tabulate columns 3 and 1 of df, then take t(X) %*% X of that table so
# that each cell pairs up two levels of column 1.
tbl <- crossprod(table(df[c(3,1)]))
# Zero the diagonal (each level paired with itself)
diag(tbl) <- 0
tbl
# re
#re 144 205 209 222 235 250
# 144 0 1 2 0 0 0
# 205 1 0 1 0 0 0
# 209 2 1 0 0 0 0
# 222 0 0 0 0 0 1
# 235 0 0 0 0 0 0
# 250 0 0 0 1 0 0

Resources