data
# Example data: one row per student with four raw scores (score1-score4) and
# the desired 0/1 indicator columns (score1x-score4x).
data <- data.frame(
  student = c(1, 2, 3, 4, 5),
  score1  = c(77, NA, 52, 99, 89),
  score2  = c(95, 89, 79, 89, 73),
  score3  = c(92, 52, 73, 64, 90),
  score4  = c(84, 57, 78, 81, 66),
  score1x = c(0, NA, 0, 1, 1),
  score2x = c(1, 1, 0, 1, 0),
  score3x = c(1, 0, 0, 0, 1),
  score4x = c(1, 0, 0, 1, 0)
)
I have data with a student id and score1-score4, and I hope to create score1x-score4x in a simple, fast way. The rule is: if, say, score1 is less than 80 then score1x is 0, otherwise it is 1.
I can do this one column at a time with data$score1x=ifelse(data$score1<80,0,1), but I am wondering whether there is a way to do this for all of them at the same time, to create score1x-score4x more quickly?
Try:
# Build the 0/1 indicators for all score columns at once.
# Rule from the question: score < 80 -> 0, otherwise 1 (NA stays NA).
score_cols <- paste0("score", 1:4)
# Comparing the data-frame slice yields a logical matrix; `* 1` turns it into 0/1.
flags <- (data[score_cols] >= 80) * 1
# Give the indicator columns their own names so they do not duplicate the
# names of the score columns they were derived from.
colnames(flags) <- paste0(score_cols, "x")
cbind(data[c("student", score_cols)], flags)
data.table solution:
library(data.table)

# Convert `data` to a data.table by reference, then add one integer 0/1
# indicator column ("<score>x") per score column.
setDT(data)
cols <- paste0("score", 1:4)
# The rule is "below 80 -> 0, otherwise 1", so a score of exactly 80 must map
# to 1: compare with >=, not >. NA scores stay NA.
data[, paste0(cols, "x") := lapply(.SD, function(x) as.integer(x >= 80)), .SDcols = cols]
data
student score1 score2 score3 score4 score1x score2x score3x score4x
1: 1 77 95 92 84 0 1 1 1
2: 2 NA 89 52 57 NA 1 0 0
3: 3 52 79 73 78 0 0 0 0
4: 4 99 89 64 81 1 1 0 1
5: 5 89 73 90 66 1 0 1 0
You could use this dplyr solution which creates new variables for columns with "score" in the name using mutate_at(), then uses rename_at() to change "_x" to "x" at the end of the column names:
library(dplyr)
# Start from the id and raw score columns only (columns 1-5).
data[1:5] %>%
  # For every column whose name contains "score", add "<name>_x" holding
  # 1 when the score is at least 80 and 0 when it is below 80 (NA stays NA).
  # The rule treats exactly 80 as 1, hence >= rather than >.
  mutate_at(vars(contains("score")), list(x = ~as.integer(. >= 80))) %>%
  # Strip the "_" that mutate_at() inserted, turning "score1_x" into "score1x".
  rename_at(vars(contains("_x")), ~gsub("_", "", ., fixed = TRUE))
student score1 score2 score3 score4 score1x score2x score3x score4x
1 1 77 95 92 84 0 1 1 1
2 2 NA 89 52 57 NA 1 0 0
3 3 52 79 73 78 0 0 0 0
4 4 99 89 64 81 1 1 0 1
5 5 89 73 90 66 1 0 1 0
Related
I have the following data frame:
| Y | Z |
-----------------
62 0
65 0
59 1
66 0
64 1
64 1
57 0
68 1
59 0
60 0
How can I filter the data frame on the Z column so that all the "leftover" rows after the final occurrence of the value 1 are removed (in this case, all the zeroes after the last 1)? For the example above, the filtered data frame would become:
| Y | Z |
-----------------
62 0
65 0
59 1
66 0
64 1
64 1
57 0
68 1
Also, how could I do the same filtering at the start of the data frame, i.e. remove all the rows that precede the first 1?
You can delete all rows after the last occurrence of a value like this:
library(dplyr)
# Keep rows 1 through the position of the last Z == 1; every row after that
# position is dropped.
df %>%
  slice(seq_len(max(which(Z == 1))))
Output:
Y Z
1 62 0
2 65 0
3 59 1
4 66 0
5 64 1
6 64 1
7 57 0
8 68 1
Another possible solution:
library(dplyr)
# Remove the rows with Z == 0 that sit in the first or the last run of Z.
# data.table::rleid(Z) numbers consecutive runs of equal Z values (1, 2, ...),
# so run 1 is the leading run and max(.) is the trailing run; the magrittr dot
# stands for the run-id vector that is piped into `%in%`.
# Rows are dropped only when Z == 0, so a leading or trailing run of 1s is kept.
df %>%
filter(!(Z == 0 & data.table::rleid(Z) %>% "%in%"(c(1, max(.)))))
#> Y Z
#> 1 59 1
#> 2 66 0
#> 3 64 1
#> 4 64 1
#> 5 57 0
#> 6 68 1
How can I transpose specific columns in a data.frame as:
id<- c(1,2,3)
t0<- c(0,0,0)
bp0<- c(88,95,79)
t1<- c(15,12,12)
bp1<- c(92,110,82)
t2<- c(25,30,20)
bp2<- c(75,99,88)
df1<- data.frame(id, t0, bp0, t1, bp1, t2, bp2)
df1
> df1
id t0 bp0 t1 bp1 t2 bp2
1 1 0 88 15 92 25 75
2 2 0 95 12 110 30 99
3 3 0 79 12 82 20 88
In order to obtain:
> df2
id t bp
1 1 0 88
2 2 0 95
3 3 0 79
4 1 15 92
5 2 12 110
6 3 12 82
7 1 25 75
8 2 30 99
9 3 20 88
That is, df2 should stack t0, t1, t2 into a single column t and bp0, bp1, bp2 into a single column bp, keeping the corresponding "id" on each row.
Using Base R, you can do:
Reprex
Code
# Stack the t* columns and the bp* columns into one long vector each
# (column after column), then pair them with the ids, which are recycled
# to the stacked length.
t_long <- stack(df1[startsWith(names(df1), "t")])$values
bp_long <- stack(df1[startsWith(names(df1), "bp")])$values
df2 <- data.frame(id = df1[[1]], t = t_long, bp = bp_long)
Output
df2
#> id t bp
#> 1 1 0 88
#> 2 2 0 95
#> 3 3 0 79
#> 4 1 15 92
#> 5 2 12 110
#> 6 3 12 82
#> 7 1 25 75
#> 8 2 30 99
#> 9 3 20 88
Created on 2022-02-14 by the reprex package (v2.0.1)
Here is a solution with pivot_longer using names_pattern:
\\w+ = one or more word characters (letters, digits or underscore)
\\d+ = one or more digits
library(dplyr)
library(tidyr)
df1 %>%
  pivot_longer(
    -id,
    # ".value" turns the first capture group (the stem, "t" or "bp") into the
    # output column names; the second group (the numeric suffix) goes into a
    # helper column called "name".
    names_to = c(".value", "name"),
    # The first group is lazy (+?) because \\w also matches digits: a greedy
    # \\w+ would split a multi-digit suffix such as "bp10" into "bp1" and "0".
    names_pattern = "(\\w+?)(\\d+)"
  ) %>%
  # The suffix was only needed to line up t and bp; drop it.
  select(-name)
id t bp
<dbl> <dbl> <dbl>
1 1 0 88
2 1 15 92
3 1 25 75
4 2 0 95
5 2 12 110
6 2 30 99
7 3 0 79
8 3 12 82
9 3 20 88
A base R option using reshape
# Wide -> long with base reshape(): the t*/bp* columns end up in the long
# columns "t" and "bp", with one block of rows per numeric suffix.
reshape(
# Insert "." before the first run of digits in each column name
# ("t0" -> "t.0", "bp1" -> "bp.1"); "id" has no digits and is left unchanged.
# NOTE(review): this relies on reshape() splitting names at "." to find the
# variable name and the time value (its documented default `sep`) - confirm.
setNames(df1, sub("(\\d+)", ".\\1", names(df1))),
direction = "long",
idvar = "id",
# Negative index: every column except the first one ("id") is time-varying.
varying = -1
)
gives
id time t bp
1.0 1 0 0 88
2.0 2 0 0 95
3.0 3 0 0 79
1.1 1 1 15 92
2.1 2 1 12 110
3.1 3 1 12 82
1.2 1 2 25 75
2.2 2 2 30 99
3.2 3 2 20 88
Can someone think of a more interesting way to combine multiple factor columns into a single numeric column?
MWE dataset:
df <- data.frame(q.82=factor(c(1,2,2,1,1)), q.77=factor(c(2,1,1,1,1)), q.72=factor(c(1,1,1,2,2)))
levels(df$q.82) <- c("","$80 and above")
levels(df$q.77) <- c("", "$75 to $79")
levels(df$q.72) <- c("", "$70 to $74")
str(df$q.82)
Factor w/ 2 levels "","$80 and above": 1 2 2 1 1
df looks like this:
q.82 q.77 q.72
1 $75 to $79
2 $80 and above
3 $80 and above
4 $70 to $74
5 $70 to $74
What I'd like is something like this, where the columns are numeric:
q.82 q.77 q.72 q
1 0 77 0 77
2 82 0 0 82
3 82 0 0 82
4 0 0 72 72
5 0 0 72 72
The following works, but seems klunky—mostly because the actual dataset has many columns.
df$q.82 <- as.numeric(as.factor(df$q.82))
df$q.82[df$q.82 == 2] <- 82
df$q.82[df$q.82 == 1] <- 0
df$q.77 <- as.numeric(as.factor(df$q.77))
df$q.77[df$q.77 == 2] <- 77
df$q.77[df$q.77 == 1] <- 0
df$q.72 <- as.numeric(as.factor(df$q.72))
df$q.72[df$q.72 == 2] <- 72
df$q.72[df$q.72 == 1] <- 0
df <- df %>% mutate(q=q.82+q.77+q.72)
A possible approach with base R using sapply:
For each column, replace non-empty strings by the numeric part of the column name and replace empty strings by zero.
Add an additional column q that contains the summed value of each row.
# For each column, replace non-empty labels by the numeric part of the column
# name (e.g. "q.82" -> 82) and empty labels by 0.
# simplify = FALSE keeps the result as a named list of columns. Letting
# sapply() simplify would collapse a one-row `df` into a plain vector, and
# rowSums() below would then fail.
out_df <- sapply(names(df), function(name) {
  ifelse(nchar(as.character(df[[name]])) > 0, as.numeric(sub("^q\\.", "", name)), 0)
}, simplify = FALSE)
out_df <- as.data.frame(out_df)
# q is the row-wise sum; at most one column is non-zero per row in the example,
# so q carries that row's value.
out_df <- transform(out_df, q = rowSums(out_df))
out_df
#> q.82 q.77 q.72 q
#> 1 0 77 0 77
#> 2 82 0 0 82
#> 3 82 0 0 82
#> 4 0 0 72 72
#> 5 0 0 72 72
Similarly, using the tidyverse:
library(tidyverse)
df_out <- imap_dfc(.x = df, .f = ~{
if_else(nchar(as.character(.x)) > 0, as.numeric(str_remove(.y, "^q\\.")), 0)
}) %>%
mutate(q = rowSums(.))
df_out
#> # A tibble: 5 x 4
#> q.82 q.77 q.72 q
#> <dbl> <dbl> <dbl> <dbl>
#> 1 0 77 0 77
#> 2 82 0 0 82
#> 3 82 0 0 82
#> 4 0 0 72 72
#> 5 0 0 72 72
Or with data.table:
library(data.table)
setDT(df)
for(j in names(df))
set(df, j = j, value = ifelse(nchar(as.character(df[[j]])) > 0, as.numeric(sub("^q\\.", "", j)), 0))
df[, q := rowSums(.SD)][]
#> q.82 q.77 q.72 q
#> 1: 0 77 0 77
#> 2: 82 0 0 82
#> 3: 82 0 0 82
#> 4: 0 0 72 72
#> 5: 0 0 72 72
Data
df <- data.frame(q.82=factor(c(1,2,2,1,1)), q.77=factor(c(2,1,1,1,1)), q.72=factor(c(1,1,1,2,2)))
levels(df$q.82) <- c("","$80 and above")
levels(df$q.77) <- c("", "$75 to $79")
levels(df$q.72) <- c("", "$70 to $74")
Here is another base R method, where we replace non-blank value in the column with the numeric part in the column name using sub.
df[] <- t(as.integer(sub(".*?(\\d+)", "\\1", names(df))) * t(df != ""))
df
# q.82 q.77 q.72
#1 0 77 0
#2 82 0 0
#3 82 0 0
#4 0 0 72
#5 0 0 72
and then if you want to row-wise sum the values you can use rowSums
df$q <- rowSums(df)
I have a data frame with distance in the first column and class in the second:
data.tab <- read.table(text = "
644 1
76 1
78 1
350 1
45 1
37 2
366 2
46 2
71 3
28 3
97 3
30 3
55 3
65 3
116 3
30 3
18 4
143 4
99 4")
I want to reshape it into a new data frame with one row per class, padding each class with zeros up to the length of the longest class. The result will be:
data.tab <- read.table(text = "
1 644 76 78 350 45 0 0 0
2 37 366 46 0 0 0 0 0
3 71 28 97 30 55 65 116 30
4 18 143 99 0 0 0 0 0")
This essentially boils down to a simple long to wide reshape
library(tidyverse)
# Long -> wide: one row per class (V2), with the distances (V1) spread over
# position columns and missing positions filled with 0.
data.tab %>%
group_by(V2) %>%
# Number the observations within each class: "V1", "V2", ... up to the group size.
mutate(col = paste0("V", 1:n())) %>%
# NOTE(review): the generated names "V1", "V2", ... overlap with the existing
# grouping column V2 - confirm how spread() resolves the clash; the printed
# result shows eight V* columns and no separate class column.
spread(col, V1, fill = 0) %>%
ungroup()
## A tibble: 4 x 8
# V1 V2 V3 V4 V5 V6 V7 V8
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 644 76 78 350 45 0 0 0
#2 37 366 46 0 0 0 0 0
#3 71 28 97 30 55 65 116 30
#4 18 143 99 0 0 0 0 0
Using df as name instead of data.tab:
MAX <- max(table(df$V2))
t(sapply(split(df$V1, df$V2), function(x) c(x, rep(0, MAX-length(x)))))
(The idea is to split V1 into groups defined by V2, making the vectors equal in length by adding 0's at the end when necessary, and then combining that into a single matrix. sapply does the last bit automatically but columnwise, so t is needed.)
another way using length<-
U <- unstack(df) # a hack learned from G.Grothendieck's answer
U <- with(df, split(V1,V2)) # more readable version of the above
M <- max(lengths(U))
R <- t(sapply(U, "length<-", M)) # setting all lengths equal
replace(R, is.na(R), 0) # replacing NAs by zeroes
And a (rather unreadable) one-liner doing the same thing:
"[<-"(R<-t(sapply(U<-unstack(df),"length<-",max(lengths(U)))),is.na(R),0)
1) xtabs Using only base R create a sequence number column within class and then use xtabs to rearrange it into a table. Finally convert that to data frame. Omit the last line of code if a table is sufficient.
data.tab2 <- transform(data.tab, seq = ave(V2, V2, FUN = seq_along))
xt <- xtabs(V1 ~ V2 + seq, data.tab2)
as.data.frame.matrix(xt)
giving:
1 2 3 4 5 6 7 8
1 644 76 78 350 45 0 0 0
2 37 366 46 0 0 0 0 0
3 71 28 97 30 55 65 116 30
4 18 143 99 0 0 0 0 0
2) ts Another base R solution is to convert the elements of each class to a ts series giving tt a multivariate time series with NAs at the ends of the shorter ones. Convert those NAs to 0 in the second line of code and then convert that to a data frame in the last line.
tt <- do.call("cbind", lapply(unstack(data.tab), ts))
tt[] <- ifelse(is.na(tt), 0, tt)
as.data.frame(t(tt))
3) Using data.tab2 from (1) use tapply to create the matrix mat and then convert that to a data.frame. Omit the last line of code if a matrix is sufficient.
mat <- with(data.tab2, tapply(V1, list(V2, seq), c, default = 0))
as.data.frame(mat)
Note
A comment claimed ifelse would be slower than a suggested alternative but benchmarking it showed no overall difference on the data in the question. Of course performance may not be very important here in the first place.
library(rbenchmark)
benchmark(
ifelse = {
tt <- do.call("cbind", lapply(unstack(data.tab), ts))
tt[] <- ifelse(is.na(tt), 0, tt)
as.data.frame(t(tt))
},
replace = {
tt <- do.call("cbind", lapply(unstack(data.tab), ts))
tt[is.na(tt)] <- 0
as.data.frame(t(tt))
}
)[1:4]
giving:
test replications elapsed relative
1 ifelse 100 0.25 1
2 replace 100 0.25 1
using data.table's transpose
cbind(sort(unique(data.tab$V2)),do.call(rbind,transpose(transpose(split(data.tab$V1, data.tab$V2), 0))))
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
#[1,] 1 644 76 78 350 45 0 0 0
#[2,] 2 37 366 46 0 0 0 0 0
#[3,] 3 71 28 97 30 55 65 116 30
#[4,] 4 18 143 99 0 0 0 0 0
I would like to analyze data.
My Database is composed of 1408 (704 for type 1 and 704 for type 2) observations and 49 variables. Here is part of my database.
The point is that I want to analyze gender of type 1(sellers) who overcharged.
Data
Subject ID Gender Period Matching group Group Type Overcharging
654 1 1 73 1 1 NA
654 1 2 73 1 1 NA
654 1 3 73 1 1 NA
654 1 4 73 1 1 NA
708 0 1 73 1 2 1
708 0 2 73 1 2 0
708 0 3 73 1 2 0
708 0 4 73 1 2 1
435 1 1 73 2 1 NA
435 1 2 73 2 1 NA
435 1 3 73 2 1 NA
435 1 4 73 2 1 NA
546 0 1 73 2 2 0
546 0 2 73 2 2 0
546 0 3 73 2 2 1
546 0 4 73 2 2 0
For example, if you take a look at matching group = 73, there are 2 groups (1 and 2), and in each group there are two types (1 and 2). For each type 1 subject (seller) we do not have information about what he did (overcharge or not), but we do have information about whether the buyers (type 2) were overcharged or not.
If I can identify the buyer who were over-treated, then, this means that the seller this buyer is interacting has over-treated the buyer. So all I need to look at is the gender of the seller in the same group as the buyer.
In matching group 73 we know, for instance, that at period 1 subject 708 (the one in group 1) was overcharged. As I know that this man belongs to group 1 and matching group 73, I am able to identify the seller who overcharged him: subject 654 with gender = 1.
In group 2 (matching group 73), we know that at period 3 agent 546 was overcharged. As I know that this man belongs to group 2 and matching group 73, I am able to identify the seller who overcharged him: subject 435 with gender = 1.
....
I would do this for all the observations I have.
However I really don't know how to proceed to code and make this condition on R.
This is what I tried to do, but doesn't fit my needs !
for (matchinggroup[type==1]==matchinggroup[type==2] &
group[type==1]==group[type==2] & period[type==1]==period[type==2])
{
if ((overtreatment==1), na.rm=TRUE)
sum(gender==1[type==1], na.rm=TRUE)
}
The expected output I would like to have is :
sum(overcharging==1[gender==1&type==1])
>3
sum(overcharging==1[gender==0&type==1])
>0
sum(overcharging==0[gender==1&type==1])
>5
sum(overcharging==0[gender==0&type==1])
>0
Not exactly sure what your desired output is, but consider this:
Data <- read.table(header = T,
text = "Subject_ID Gender Period Matching_group Group Type Overcharging
654 1 1 73 1 1 NA
654 1 2 73 1 1 NA
654 1 3 73 1 1 NA
654 1 4 73 1 1 NA
708 0 1 73 1 2 1
708 0 2 73 1 2 0
708 0 3 73 1 2 0
708 0 4 73 1 2 1
435 1 1 73 2 1 NA
435 1 2 73 2 1 NA
435 1 3 73 2 1 NA
435 1 4 73 2 1 NA
546 0 1 73 2 2 0
546 0 2 73 2 2 0
546 0 3 73 2 2 1
546 0 4 73 2 2 0
")
dat1 <- subset(Data, Overcharging==1)
This will find all the buyer rows (Type 2) where the buyer was overcharged. You can then find the matching seller (Type 1) for each of those rows using this loop:
# For every overcharged-buyer row in dat1, look up the seller row (Type == 1)
# from the same period, matching group and group.
# The matches are collected in a preallocated list and bound once at the end
# instead of growing a data frame with rbind() on every iteration.
matches <- vector("list", nrow(dat1))
# seq_len() makes the loop a no-op when dat1 has no rows; 1:nrow(dat1) would
# iterate over c(1, 0) in that case.
for (i in seq_len(nrow(dat1))) {
  buyer <- dat1[i, ]
  matches[[i]] <- Data[Data$Period == buyer$Period &
                         Data$Matching_group == buyer$Matching_group &
                         Data$Group == buyer$Group & Data$Type == 1, ]
}
out <- do.call(rbind, matches)
# do.call(rbind, list()) returns NULL; keep the original empty-data-frame result.
if (is.null(out)) {
  out <- data.frame()
}
Which will give you:
Subject_ID Gender Period Matching_group Group Type Overcharging
1 654 1 1 73 1 1 NA
4 654 1 4 73 1 1 NA
11 435 1 3 73 2 1 NA
I think a "for loop" solution is not well suited to R.
I developed another solution for you with data.table by separating sellers and buyers, and then joining them.
library(data.table)
Data <- data.table(read.table(header = T,
text = "Subject_ID Gender Period Matching_group Group Type Overcharging
654 1 1 73 1 1 NA
654 1 2 73 1 1 NA
654 1 3 73 1 1 NA
654 1 4 73 1 1 NA
708 0 1 73 1 2 1
708 0 2 73 1 2 0
708 0 3 73 1 2 0
708 0 4 73 1 2 1
435 1 1 73 2 1 NA
435 1 2 73 2 1 NA
435 1 3 73 2 1 NA
435 1 4 73 2 1 NA
546 0 1 73 2 2 0
546 0 2 73 2 2 0
546 0 3 73 2 2 1
546 0 4 73 2 2 0
")
)
# Label each row as seller (Type 1) or buyer (any other Type).
Data[, SubjectType := ifelse(Type==1, "Seller", "Buyer")]
# One row per subject with their gender.
Subjects <- unique(Data[, .(Subject_ID, Gender)])
# One row per (Matching_group, Group) with a Buyer and a Seller ID column.
# mean() only collapses the repeated per-period IDs into a single value.
# NOTE(review): this is meaningful only if each group has exactly one buyer
# and one seller - confirm for the full data set.
Matches <- dcast(Data, Matching_group+Group~SubjectType, value.var="Subject_ID", fun.aggregate = mean)
# Rows with a recorded Overcharging value (the sellers' rows are NA in the sample data).
Buys <- Data[!is.na(Overcharging), .(Buyer = Subject_ID, BuyerGender = Gender, Period, Matching_group, Group, Overcharging)]
# Attach the seller ID of the same group, then that seller's gender.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
Buys <- merge(Buys, Matches, by=c("Buyer", "Matching_group", "Group"), all.x = TRUE)
Buys <- merge(Buys, Subjects[, .(Seller = Subject_ID, SellerGender = Gender)], by="Seller", all.x = TRUE)
# Counts of non-overcharged and overcharged purchases by buyer/seller gender.
Buys[Overcharging==0, .N, .(BuyerGender, SellerGender)]
Buys[Overcharging==1, .N, .(BuyerGender, SellerGender)]