define a new column - r

I want to define a column which is 1 for rows with col3==loop it is 2 after col3==loop until they get to a row with col3==loop or col3==a or col3==b. otherwise 0.
example
group1 group2 col3
1 1 a
1 1 loop
1 1 d
1 1 c
1 1 b
1 1 f
1 2 loop
1 2 a
1 2 loop
1 2 f
2 1 loop
2 1 g
group1 group2 col3 loop
1 1 a 0
1 1 loop 1
1 1 d 2
1 1 c 2
1 1 b 2
1 1 f 0
1 2 loop 1
1 2 a 2
1 2 h 0
1 2 f 0
2 1 b 0
2 1 g 0
let me know if it is not clear

The following code does what the question asks for. It uses index vectors to assign 1 and 2 to the appropriate places in the new column. I have read in the expected output and created column loop2 to compare the result with the posted column loop. Change this after checking if the answer is right.
i1 <- which(df1$col3 == "loop")
i2 <- which(df1$col3 %in% c("a", "b"))
df1$loop2 <- 0
df1$loop2[i1] <- 1
k <- Map(`:`, (i1 + 1), i2[findInterval(i1, i2) + 1])
df1$loop2[unlist(k)] <- 2
df1
# group1 group2 col3 loop loop2
#1 1 1 a 0 0
#2 1 1 loop 1 1
#3 1 1 d 2 2
#4 1 1 c 2 2
#5 1 1 b 2 2
#6 1 1 f 0 0
#7 1 2 loop 1 1
#8 1 2 a 2 2
#9 1 2 h 0 0
#10 1 2 f 0 0
#11 2 1 b 0 0
#12 2 1 g 0 0
Data.
df1 <- read.table(text = "
group1 group2 col3 loop
1 1 a 0
1 1 loop 1
1 1 d 2
1 1 c 2
1 1 b 2
1 1 f 0
1 2 loop 1
1 2 a 2
1 2 h 0
1 2 f 0
2 1 b 0
2 1 g 0
", header = TRUE)

Related

R: df header columns are ordinal ranking and spread across columns for each observation

I have a questionnaire data that look like below:
items no_stars1 no_stars2 no_stars3 average satisfied bad
1 A 1 0 0 0 0 1
2 B 0 1 0 1 0 0
3 C 0 0 1 0 1 0
4 D 0 1 0 0 1 0
5 E 0 0 1 1 0 0
6 F 0 0 1 0 1 0
7 G 1 0 0 0 0 1
Basically, the header columns (no. of stars rating and satisfactory) are ordinal ranking for each Items. I would like to summarize the no_stars(col 2:4) and satisfactory(col 5:7) into one column so that the output would look like this :
items no_stars satisfactory
1 A 1 1
2 B 2 2
3 C 3 3
4 D 2 3
5 E 3 2
6 F 3 3
7 G 1 1
$no_stars <- 1 is for no_stars1, 2 for no_stars2, 3 for no_stars3
$satisfactory <- 1 is for bad, 2 for average, 3 for good
I have tried the code below
df$no_stars2[df$no_stars2 == 1] <- 2
df$no_stars3[df$no_stars3 == 1] <- 3
df$average[df$average == 1] <- 2
df$satisfied[df$satisfied == 1] <- 3
no_stars <- df$no_stars1 + df$no_stars2 + df$no_stars3
satisfactory <- df$bad + df$average + df$satisfied
tidy_df <- data.frame(df$Items, no_stars, satisfactory)
tidy_df
Is there any function in R that can do the same thing? or
anyone got better and simpler solution ?
Thanks
Just use max.col and set preferences:
starsOrder<-c("no_stars1","no_stars2","no_stars3")
satOrder<-c("bad","average","satisfied")
data.frame(items=df$items,no_stars=max.col(df[,starsOrder]),
satisfactory=max.col(df[,satOrder]))
# items no_stars satisfactory
#1 A 1 1
#2 B 2 2
#3 C 3 3
#4 D 2 3
#5 E 3 2
#6 F 3 3
#7 G 1 1
Another tidyverse solution making use of factor to integer conversions to encode no_stars and satisfactory and spreading from wide to long twice:
library(tidyverse)
df %>%
gather(no_stars, v1, starts_with("no_stars")) %>%
mutate(no_stars = as.integer(factor(no_stars))) %>%
gather(satisfactory, v2, average, satisfied, bad) %>%
filter(v1 > 0 & v2 > 0) %>%
mutate(satisfactory = as.integer(factor(
satisfactory, levels = c("bad", "average", "satisfied")))) %>%
select(-v1, -v2) %>%
arrange(items)
# items no_stars satisfactory
#1 A 1 1
#2 B 2 2
#3 C 3 3
#4 D 2 3
#5 E 3 2
#6 F 3 3
#7 G 1 1
While there may be more elegant solutions, using dplyr::case_when() gives you the flexibility to code things however you want:
library(dplyr)
df %>%
dplyr::mutate(
no_stars = dplyr::case_when(
no_stars1 == 1 ~ 1,
no_stars2 == 1 ~ 2,
no_stars3 == 1 ~ 3)
, satisfactory = dplyr::case_when(
average == 1 ~ 2,
satisfied == 1 ~ 3,
bad == 1 ~ 1)
)
# items no_stars1 no_stars2 no_stars3 average satisfied bad no_stars satisfactory
# 1 A 1 0 0 0 0 1 1 1
# 2 B 0 1 0 1 0 0 2 2
# 3 C 0 0 1 0 1 0 3 3
# 4 D 0 1 0 0 1 0 2 3
# 5 E 0 0 1 1 0 0 3 2
# 6 F 0 0 1 0 1 0 3 3
# 7 G 1 0 0 0 0 1 1 1
dat%>%
replace(.==1,NA)%>%
replace_na(setNames(as.list(names(.)),names(.)))%>%
replace(.==0,NA)%>%
mutate(s=coalesce(!!!.[2:4]),
no_stars=as.numeric(factor(s,unique(s))),
t=coalesce(!!!.[5:7]),
satisfactory=as.numeric(factor(t,unique(t))))%>%
select(items,no_stars,satisfactory)
items no_stars satisfactory
1 A 1 1
2 B 2 2
3 C 3 3
4 D 2 3
5 E 3 2
6 F 3 3
7 G 1 1
using apply and match :
data.frame(
items = df1$items,
no_stars = apply(df1[2:4], 1, match, x=1),
satisfactory = apply(df1[c(7,5:6)], 1, match, x=1))
# items no_stars satisfactory
# 1 A 1 1
# 2 B 2 2
# 3 C 3 3
# 4 D 2 3
# 5 E 3 2
# 6 F 3 3
# 7 G 1 1
data
df1 <- read.table(header=TRUE,stringsAsFactors=FALSE,text="
items no_stars1 no_stars2 no_stars3 average satisfied bad
1 A 1 0 0 0 0 1
2 B 0 1 0 1 0 0
3 C 0 0 1 0 1 0
4 D 0 1 0 0 1 0
5 E 0 0 1 1 0 0
6 F 0 0 1 0 1 0
7 G 1 0 0 0 0 1")

dataframes in R with multi-level rows and columns

Let's say I have the following data frame.
> df = data.frame(rowsA = sample(c('A','B','C'), 100, replace=TRUE),
rowsB = sample(c('D','E','F'), 100, replace=TRUE),
colsA = sample(c('G','H','I'), 100, replace=TRUE),
colsB = sample(c('J','K','L'), 100, replace=TRUE))
> head(df)
rowsA rowsB colsA colsB
1 B E I L
2 A E G J
3 A E H K
4 A D I J
5 C F G J
6 A F G J
Is it possible to create a multi-level table of counts?
In excel, it is possible with the PivotTable functionality
I think it possible in python in pandas with the df.columns.levels method.
I also figured out how to do multi-level rows only in R with dplyr (but haven't figured out multi-level columns)
df %>%
group_by(rowsA, rowsB, colsA) %>%
summarise(count = n()) %>%
spread(colsA, count)
# A tibble: 9 x 5
# Groups: rowsA, rowsB [9]
rowsA rowsB G H I
* <fctr> <fctr> <int> <int> <int>
1 A D 5 3 1
2 A E 1 2 1
3 A F 5 8 NA
4 B D 5 5 5
5 B E 2 4 6
6 B F 4 6 5
7 C D 2 6 NA
8 C E 6 5 3
9 C F 4 3 3
Paste the columns that goes to the header into one column, then reshape it, in which way you have a contingency table with the same meaning as the multi level count:
library(dplyr); library(tidyr)
df %>%
unite(header, c('colsA', 'colsB')) %>%
count(rowsA, rowsB, header) %>%
spread(header, n, fill = 0)
# A tibble: 9 x 11
# rowsA rowsB G_J G_K G_L H_J H_K H_L I_J I_K I_L
#* <fctr> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 A D 1 0 0 0 3 1 1 1 0
#2 A E 2 0 0 1 1 0 0 0 1
#3 A F 5 0 0 3 2 1 0 1 1
#4 B D 0 1 1 1 0 3 1 1 1
#5 B E 2 2 1 3 1 1 0 3 1
#6 B F 1 1 2 3 3 0 1 2 1
#7 C D 0 2 3 1 2 0 4 3 2
#8 C E 2 2 2 1 2 0 0 1 1
#9 C F 1 0 1 2 0 1 2 1 2
Or if you are OK with a table/array/matrix as result, you can use xtabs, (borrowed from this answer), which essentially gives a 4-d array but with ftable, it can be displayed as you need:
ftable(xtabs(data = df), row.vars = 1:2, col.vars = 3:4)
# colsA G H I
# colsB J K L J K L J K L
#rowsA rowsB
#A D 1 0 0 0 3 1 1 1 0
# E 2 0 0 1 1 0 0 0 1
# F 5 0 0 3 2 1 0 1 1
#B D 0 1 1 1 0 3 1 1 1
# E 2 2 1 3 1 1 0 3 1
# F 1 1 2 3 3 0 1 2 1
#C D 0 2 3 1 2 0 4 3 2
# E 2 2 2 1 2 0 0 1 1
# F 1 0 1 2 0 1 2 1 2

Ifelse statment across multiple rows

Looking to add a column based on the values of two columns, but over more than one row.
Example Dataset Code:
A = c(1,1,1,2,2,2,3,3,3,4,4)
B = c(1,2,3,1,2,3,1,2,3,1,2)
C = c(0,0,0,1,0,0,1,1,1,0,1)
data <- data.frame(A,B,C)
Dataset:
A B C
1 1 1 0
2 1 2 0
3 1 3 0
4 2 1 1
5 2 2 0
6 2 3 0
7 3 1 1
8 3 2 1
9 3 3 1
10 4 1 0
11 4 2 1
Ifelse statements:
What I am trying to achieve is "Create column D.If column C == 1 in any row where column A == x, column D = 1. Else column D == 0"
Desired Output:
A B C D
1 1 1 0 0
2 1 2 0 0
3 1 3 0 0
4 2 1 1 1
5 2 2 0 1
6 2 3 0 1
7 3 1 1 1
8 3 2 1 1
9 3 3 1 1
10 4 1 0 1
11 4 2 1 1
What I've done:
I've thought about it today but can't come up with a logical answer, I've tried looking at the data in long and wide formats but nothings jumped out.
Note:
In actual application the number of times x appears in column C is not equal (some contain one repeat in the dataset, others contain 20).
# just check using any() if any group has a single row with C==1
library(dplyr)
data %>% group_by(A) %>% mutate(D = as.numeric(any(C==1)))
library(data.table)
data[, D:=as.numeric(any(C==1)), by = .(A)]
# A B C D
#1 1 1 0 0
#2 1 2 0 0
#3 1 3 0 0
#4 2 1 1 1
#5 2 2 0 1
#6 2 3 0 1
#7 3 1 1 1
#8 3 2 1 1
#9 3 3 1 1
#10 4 1 0 1
#11 4 2 1 1
Easy with data.table
library(data.table)
data <- data.table(data)
x=2
data[,D:=ifelse(!A==x,ifelse(C==1,1,0),0)]
data
We can use ave from base R
data$D <- with(data, as.integer(ave(C==1, A, FUN=any)))
data
# A B C D
#1 1 1 0 0
#2 1 2 0 0
#3 1 3 0 0
#4 2 1 1 1
#5 2 2 0 1
#6 2 3 0 1
#7 3 1 1 1
#8 3 2 1 1
#9 3 3 1 1
#10 4 1 0 1
#11 4 2 1 1

Conditional variable using R code

I have a data set named "dats".
id y i j
1 0 1 1
1 0 1 2
1 0 1 3
2 1 2 1
2 1 2 2
2 1 2 3
I want to calculate, a new variable ynew=(yij-1*yij) based on (y11*y12, y12*y13....so on). I have tried in this way:
ynew <- NULL
for(p in 1)
{
for (q in ni)
{
ynew[p,q] <- dats$y[dats$i==p & dats$j==q-1]*dats$y[dats$i==p & dats$j==q]
}
}
ynew
But it showing error!
Expected output
id y i j ynew
1 0 1 1 NA
1 0 1 2 0
1 0 1 3 0
2 1 2 1 NA
2 1 2 2 1
2 1 2 3 1
Could anybody help? TIA
Using dplyr and rollapply from zoo package,
library(dplyr)
library(zoo)
dats %>%
group_by(id) %>%
mutate(ynew = c(NA, rollapply(y, 1, by = 2, prod)))
#Source: local data frame [6 x 5]
#Groups: id [2]
# id y i j ynew
# (int) (int) (int) (int) (dbl)
#1 1 0 1 1 NA
#2 1 0 1 2 0
#3 1 0 1 3 0
#4 2 1 2 1 NA
#5 2 1 2 2 1
#6 2 1 2 3 1
May be we need to just multiply with the lag of 'y' grouped by 'id'
library(data.table)
setDT(dats)[, ynew := y * shift(y), by = id]
dats
# id y i j ynew
#1: 1 0 1 1 NA
#2: 1 0 1 2 0
#3: 1 0 1 3 0
#4: 2 1 2 1 NA
#5: 2 1 2 2 1
#6: 2 1 2 3 1
It could also be done with roll_prod
library(RcppRoll)
setDT(dats)[, ynew := c(NA, roll_prod(y, 2)), by = id]
dats
# id y i j ynew
#1: 1 0 1 1 NA
#2: 1 0 1 2 0
#3: 1 0 1 3 0
#4: 2 1 2 1 NA
#5: 2 1 2 2 1
#6: 2 1 2 3 1

How can i count occurrence with few variables in R

I have some example data.frame:
x<- data.frame(c(0,1,2,1,2,1,2),c(0,1,2,1,2,2,1),c(0,1,2,1,2,1,2),c(0,1,2,1,2,2,1))
colnames(x) <- c('PV','LA','Wiz','LAg')
I want to count occurrence by hole row. The result should look like:
PV LA Wiz Lag Replace
0 0 0 0 1
1 1 1 1 2
2 2 2 2 2
1 2 1 2 1
2 1 2 1 1
The row 0 0 0 0 was replaced 1, row 1 1 1 1 was replaced 2 times etc.
Do you have any idea, how can I do it ?
Maybe you want this?
as.data.frame(table(do.call(paste, x[,-1])))
# Var1 Freq
#1 0 0 0 0 1
#2 1 1 1 1 2
#3 1 2 1 2 1
#4 2 1 2 1 1
#5 2 2 2 2 2

Resources