calculate the rate under the same in using R - r

I have a question to calculate the rate under the same id numbers.
Here is the sample dataset d:
id answer
1 1
1 0
1 0
1 1
1 1
1 1
1 0
2 0
2 0
2 0
3 1
3 0
The ideal output is
id rate freq
1 4/7 (=0.5714) 7
2 0 3
3 1/2 (=0.5) 2
Thanks.

Just for fun, you can use aggregate
> aggregate(answer~id, function(x) c(rate=mean(x), freq=length(x)), data=df1)
id answer.rate answer.freq
1 1 0.5714286 7.0000000
2 2 0.0000000 3.0000000
3 3 0.5000000 2.0000000

Try
library(data.table)
setDT(df1)[,list(rate= mean(answer), freq=.N) ,id]
# id rate freq
#1: 1 0.5714286 7
#2: 2 0.0000000 3
#3: 3 0.5000000 2
Or
library(dplyr)
df1 %>%
group_by(id) %>%
summarise(rate=mean(answer), freq=n())
data
df1 <- structure(list(id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
3L, 3L), answer = c(1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L,
0L)), .Names = c("id", "answer"), class = "data.frame",
row.names = c(NA, -12L))

Related

Sorting data with some similar words in R

I have a database with 100 columns, but a minimal production of my data are as follows:
df1<=read.table(text="PG1S1AW KOM1S1zo PG2S2AW KOM2S2zo PG3S3AW KOM3S3zo PG4S4AW KOM4S4zo PG5S5AW KOM5S5zo
4 1 2 4 4 3 0 4 0 5
4 4 3 1 3 1 0 3 0 1
2 3 5 3 3 2 1 4 0 2
1 1 1 1 1 3 0 5 0 1
2 5 3 4 4 5 0 1 3 4", header=TRUE)
I want to get columns starting with KOM and PG which have a greater of 3 . So we need to have PG4, KOM4 and above. Put it simply, starting with PG and KOM have the same values which is 4 and greater.
The intended output is:
PG4S4AW KOM4S4zo PG5S5AW KOM5S5zo
0 4 0 5
0 3 0 1
1 4 0 2
0 5 0 1
0 1 3 4
I have used the following code, but it does not work for me:
df2<- df1%>% select(contains("KO"))
Thanks for your help.
It is not entirely clear about the patterns. We create a function (f1) to extract one or more digits (\\d+) that follows the 'KOM' or (|) 'PG' with str_extract (from stringr), convert to numeric ('v1'), similarly, extract numbers after the 'S' ('v2'). Do a check whether these values are same and if one of the value is greater than 3, wrap with which so that if there are any NAs resulting from str_extract would be removed as which gives the column index while removing any NAs. Use the function in select to select the columns that follow the pattern
library(dplyr)
library(stringr)
f1 <- function(nm) {
v1 <- as.numeric(str_extract(nm, "(?<=(KOM|PG))\\d+"))
v2 <- as.numeric(str_extract(nm, "(?<=S)\\d+"))
nm[which((v1 == v2) & (v1 > 3))]
}
df1 %>%
select(f1(names(.)))
# PG4S4AW KOM4S4zo PG5S5AW KOM5S5zo
#1 0 4 0 5
#2 0 3 0 1
#3 1 4 0 2
#4 0 5 0 1
#5 0 1 3 4
data
df1 <- structure(list(PG1S1AW = c(4L, 4L, 2L, 1L, 2L), KOM1S1zo = c(1L,
4L, 3L, 1L, 5L), PG2S2AW = c(2L, 3L, 5L, 1L, 3L), KOM2S2zo = c(4L,
1L, 3L, 1L, 4L), PG3S3AW = c(4L, 3L, 3L, 1L, 4L), KOM3S3zo = c(3L,
1L, 2L, 3L, 5L), PG4S4AW = c(0L, 0L, 1L, 0L, 0L), KOM4S4zo = c(4L,
3L, 4L, 5L, 1L), PG5S5AW = c(0L, 0L, 0L, 0L, 3L), KOM5S5zo = c(5L,
1L, 2L, 1L, 4L)), class = "data.frame", row.names = c(NA, -5L
))
Given your example data, you can just instead look for the numbers 4 or 5.
df1 %>%
select(matches("4|5"))
#> KO4S4AW KOM4S4zo KO5S5AW KOM5S5zo
#> 1 0 4 0 5
#> 2 0 3 0 1
#> 3 1 4 0 2
#> 4 0 5 0 1
#> 5 0 1 3 4

How can I combine rows based on a specific parameter in R

I have a dataframe which looks like this:
ID Smoker Asthma Age Sex COPD Event_Date
1 1 0 0 65 M 0 12-2009
2 1 0 1 65 M 0 21-2009
3 1 0 1 65 M 0 23-2009
4 2 1 0 67 M 0 19-2010
5 2 1 0 67 M 0 21-2010
6 2 1 1 67 M 1 01-2011
7 2 1 1 67 M 1 02-2011
8 3 2 1 77 F 0 09-2015
9 3 2 1 77 F 1 10-2015
10 3 2 1 77 F 1 10-2015
I would like to know whether it would be possible it combine my rows in order to achieve a dataset like this:
ID Smoker Asthma Age Sex COPD Event_Data
1 0 1 65 M 0 12-2009
2 1 1 66 M 1 19-2010
3 2 1 77 F 1 09-2015
I have tried using the unique function, however this doesn't give me my desired output and repeats the ID for multiple rows.
This is an example of the code i've tried
Data2<-unique(Data)
I do not just want the first row because I want to include each column status. For example, just getting the first row would not include the COPD status which occurs in the later rows for each ID.
Alternative Solution:
library(dplyr)
d %>%
group_by(ID, Age, Sex, Smoker) %>%
summarise(Asthma = !is.na(match(1, Asthma)),
COPD = !is.na(match(1, COPD)),
Event_Date = first(Event_Date)) %>%
ungroup %>%
mutate_if(is.logical, as.numeric)
# A tibble: 3 x 7
ID Age Sex Smoker Asthma COPD Event_Date
<int> <int> <fct> <int> <dbl> <dbl> <fct>
1 1 65 M 0 1 0 12-2009
2 2 67 M 1 1 1 19-2010
3 3 77 F 2 1 1 09-2015
If you want to get the (first) row for each ID you can try something like this:
d <- structure(list(ID = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L),
Smoker = c(0L, 0L, 0L, 1L, 1L, 1L, 1L, 2L, 2L, 2L),
Asthma = c(0L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L),
Age = c(65L, 65L, 65L, 67L, 67L, 67L, 67L, 77L, 77L, 77L),
Sex = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L),
.Label = c("F", "M"), class = "factor"),
COPD = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L),
Event_Date = structure(c(5L, 7L, 9L, 6L, 8L, 1L, 2L, 3L, 4L, 4L),
.Label = c("01-2011", "02-2011", "09-2015",
"10-2015", "12-2009", "19-2010",
"21-2009", "21-2010", "23-2009"),
class = "factor")),
class = "data.frame",
row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"))
d[!duplicated(d$ID), ]
# ID Smoker Asthma Age Sex COPD Event_Date
# 1 1 0 0 65 M 0 12-2009
# 4 2 1 0 67 M 0 19-2010
# 8 3 2 1 77 F 0 09-2015
Use max when you need a value further down and dplyr::first for others, here an example
library(dplyr)
df %>% group_by(ID) %>% summarise(Smoker=first(Smoker), Asthma=max(Asthma, na.rm = TRUE))

Apply function across multiple columns

Please find here a very small subset of a long data.table I am working with
dput(dt)
structure(list(id = 1:15, pnum = c(4298390L, 4298390L, 4298390L,
4298558L, 4298558L, 4298559L, 4298559L, 4299026L, 4299026L, 4299026L,
4299026L, 4300436L, 4300436L, 4303566L, 4303566L), invid = c(15L,
101L, 102L, 103L, 104L, 103L, 104L, 106L, 107L, 108L, 109L, 87L,
111L, 2L, 60L), fid = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 2L,
4L, 4L, 4L, 4L, 3L, 3L, 2L, 2L), .Label = c("CORN", "DowCor",
"KIM", "Texas"), class = "factor"), dom_kn = c(1L, 0L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L), prim_kn = c(1L,
0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L), pat_kn = c(1L,
0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L), net_kn = c(1L,
0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L), age_kn = c(1L,
0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L), legclaims = c(5L,
0L, 0L, 2L, 5L, 2L, 5L, 0L, 0L, 0L, 0L, 5L, 0L, 5L, 2L), n_inv = c(3L,
3L, 3L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L)), .Names = c("id",
"pnum", "invid", "fid", "dom_kn", "prim_kn", "pat_kn", "net_kn",
"age_kn", "legclaims", "n_inv"), class = "data.frame", row.names = c(NA,
-15L))
I am looking to apply a tweaked greater than comparison in 5 different columns.
Within each pnum (patent), there are multiple invid (inventors). I want to compare the values of the columns dom_kn, prim_kn, pat_kn, net_kn, and age_kn per row, to the values in the other rows with the same pnum. The comparison is simply > and if the value is indeed bigger than the other, one "point" should be attributed.
So for the first row pnum == 4298390 and invid == 15, you can see the values in the five columns are all 1, while the values for invid == 101 | 102 are all zero. This means that if we individually compare (is greater than?) each value in the first row to each cell in the second and third row, the total sum would be 10 points. In every single comparison, the value in the first row is bigger and there are 10 comparisons.
The number of comparisons is by design 5 * (n_inv -1).
The result I am looking for for row 1 should then be 10 / 10 = 1.
For pnum == 4298558 the columns net_kn and age_kn both have values 1 in the two rows (for invid 103 and 104), so that each should get 0.5 points (if there would be three inventors with value 1, everyone should get 0.33 points). The same goes for pnum == 4298558.
For the next pnum == 4299026 all values are zero so every comparison should result in 0 points.
Thus note the difference: There are three different dyadic comparisons
1 > 0 --> assign 1
1 = 1 --> assign 1 / number of positive values in column subset
0 = 0 --> assign 0
Desired result
An extra column result in the data.table with values 1 0 0 0.2 0.8 0.2 0.8 0 0 0 0 1 0 0.8 0.2
Any suggestions on how to compute this efficiently?
Thanks!
vars = grep('_kn', names(dt), value = T)
# all you need to do is simply assign the correct weight and sum the numbers up
dt[, res := 0]
for (var in vars)
dt[, res := res + get(var) / .N, by = c('pnum', var)]
# normalize
dt[, res := res/sum(res), by = pnum]
# id pnum invid fid dom_kn prim_kn pat_kn net_kn age_kn legclaims n_inv res
# 1: 1 4298390 15 CORN 1 1 1 1 1 5 3 1.0
# 2: 2 4298390 101 CORN 0 0 0 0 0 0 3 0.0
# 3: 3 4298390 102 CORN 0 0 0 0 0 0 3 0.0
# 4: 4 4298558 103 DowCor 0 0 0 1 1 2 2 0.2
# 5: 5 4298558 104 DowCor 1 1 1 1 1 5 2 0.8
# 6: 6 4298559 103 DowCor 0 0 0 1 1 2 2 0.2
# 7: 7 4298559 104 DowCor 1 1 1 1 1 5 2 0.8
# 8: 8 4299026 106 Texas 0 0 0 0 0 0 4 NaN
# 9: 9 4299026 107 Texas 0 0 0 0 0 0 4 NaN
#10: 10 4299026 108 Texas 0 0 0 0 0 0 4 NaN
#11: 11 4299026 109 Texas 0 0 0 0 0 0 4 NaN
#12: 12 4300436 87 KIM 1 1 1 1 1 5 2 1.0
#13: 13 4300436 111 KIM 0 0 0 0 0 0 2 0.0
#14: 14 4303566 2 DowCor 1 1 1 1 1 5 2 0.8
#15: 15 4303566 60 DowCor 1 0 0 1 0 2 2 0.2
Dealing with the above NaN case (arguably the correct answer), is left to the reader.
Here's a fastish solution using dplyr:
library(dplyr)
dt %>%
group_by(pnum) %>% # group by pnum
mutate_each(funs(. == max(.) & max(.) != 0), ends_with('kn')) %>%
#give a 1 if the value is the max, and not 0. Only for the column with kn
mutate_each(funs(. / sum(.)) , ends_with('kn')) %>%
#correct for multiple maximums
select(ends_with('kn')) %>%
#remove all non kn columns
do(data.frame(x = rowSums(.[-1]), y = sum(.[-1]))) %>%
#make a new data frame with x = rowsums for each indvidual
# and y the colusums
mutate(out = x/y)
#divide by y (we could just use /5 if we always have five columns)
giving your desired output in the column out:
Source: local data frame [15 x 4]
Groups: pnum [6]
pnum x y out
(int) (dbl) (dbl) (dbl)
1 4298390 5 5 1.0
2 4298390 0 5 0.0
3 4298390 0 5 0.0
4 4298558 1 5 0.2
5 4298558 4 5 0.8
6 4298559 1 5 0.2
7 4298559 4 5 0.8
8 4299026 NaN NaN NaN
9 4299026 NaN NaN NaN
10 4299026 NaN NaN NaN
11 4299026 NaN NaN NaN
12 4300436 5 5 1.0
13 4300436 0 5 0.0
14 4303566 4 5 0.8
15 4303566 1 5 0.2
The NaNs come from the groups with no winners, convert them back using eg:
x[is.na(x)] <- 0

Create n data sets from one data set without repetition using stratified sampling

I have a data set train which has say 500 rows, I would like to get a data frame with n columns each containing 500/n values(row numbers without repetition in other columns) basing on stratified sampling of a column in train, say train$y.
I have tried the following but it returns duplicate values,
library(caret)
n <- 10 # I want to divide my data set in to 10 parts
data_partition <- createDataPartition(y = train$y, times = 10,
p = 1/n, list = F)
To summarize with an example,
If I have a data set train with 100 rows and one of the column train$y(value= 0 or 1). I would like to get 10 data sets with 10 rows each from the train and they should be stratified basing on train$y and they should not be seen on other 9 data sets.
Example input:
ID x y
1 1 0
2 2 0
3 3 1
4 1 1
5 2 1
6 4 1
7 4 0
8 4 1
9 3 1
10 1 1
11 2 1
12 3 0
13 4 1
14 5 1
15 6 1
16 10 1
17 9 1
18 3 0
19 7 0
20 8 1
Expected output (4 first column, with details of each set aside)
ID x y sample set 1 set 2 set 3
1 1 0 set 2 ID x y ID x y ID x y
2 2 0 set 3 8 4 1 11 2 1 17 9 1
3 3 1 set 3 9 3 1 12 3 0 5 2 1
4 1 1 set 3 10 1 1 13 4 1 6 4 1
5 2 1 set 3 18 3 0 1 1 0 7 4 0
6 4 1 set 3 19 7 0 14 5 1 2 2 0
7 4 0 set 3 20 8 1 15 6 1 3 3 1
8 4 1 set 1 16 10 1 4 1 1
9 3 1 set 1
10 1 1 set 1
11 2 1 set 2
12 3 0 set 2
13 4 1 set 2
14 5 1 set 2
15 6 1 set 2
16 10 1 set 2
17 9 1 set 3
18 3 0 set 1
19 7 0 set 1
20 8 1 set 1
In the above example given input as ID,x and y. I would like to get the column sample which I can segregate into those 3 tables(to the right) whenever I want to.
Please observe, the y in the data has 14- 1s and 6- 0s which are in the ratio of 70:30 and the output sets are almost in similar ratio.
Sample dataset in a copy/run friendly format:
data <- structure(list(ID = 1:20, x = c(1L, 2L, 3L, 1L, 2L, 4L, 4L, 4L,
3L, 1L, 2L, 3L, 4L, 5L, 6L, 10L, 9L, 3L, 7L, 8L), y = c(0L, 0L,
1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L,
0L, 1L)), .Names = c("ID", "x", "y"), class = "data.frame", row.names = c(NA,
-20L))
It can be done using the caret package. Try the code below
# Createing dataset
data <- structure(list(ID = 1:20, x = c(1L, 2L, 3L, 1L, 2L, 4L, 4L, 4L,
3L, 1L, 2L, 3L, 4L, 5L, 6L, 10L, 9L, 3L, 7L, 8L), y = c(0L, 0L,
1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L,
0L, 1L)), .Names = c("ID", "x", "y"), class = "data.frame", row.names = c(NA, -20L))
# Solution
library(caret)
k <- createFolds(data$y,k = 3,list = F)
addmargins(table(k,data$y))

How to count the number of combinations of boolean data in R

What is the best way to determine a factor or create a new category field based on a number of boolean fields? In this example, I need to count the number of unique combinations of medications.
> MultPsychMeds
ID OLANZAPINE HALOPERIDOL QUETIAPINE RISPERIDONE
1 A 1 1 0 0
2 B 1 0 1 0
3 C 1 0 1 0
4 D 1 0 1 0
5 E 1 0 0 1
6 F 1 0 0 1
7 G 1 0 0 1
8 H 1 0 0 1
9 I 0 1 1 0
10 J 0 1 1 0
Perhaps another way to state it is that I need to pivot or cross tabulate the pairs. The final results need to look something like:
Combination Count
OLANZAPINE/HALOPERIDOL 1
OLANZAPINE/QUETIAPINE 3
OLANZAPINE/RISPERIDONE 4
HALOPERIDOL/QUETIAPINE 2
This data frame can be replicated in R with:
MultPsychMeds <- structure(list(ID = structure(1:10, .Label = c("A", "B", "C",
"D", "E", "F", "G", "H", "I", "J"), class = "factor"), OLANZAPINE = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L), HALOPERIDOL = c(1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L), QUETIAPINE = c(0L, 1L, 1L, 1L,
0L, 0L, 0L, 0L, 1L, 1L), RISPERIDONE = c(0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 0L, 0L)), .Names = c("ID", "OLANZAPINE", "HALOPERIDOL",
"QUETIAPINE", "RISPERIDONE"), class = "data.frame", row.names = c(NA,
-10L))
Here's one approach using the reshape and plyr packages:
library(reshape)
library(plyr)
#Melt into long format
dat.m <- melt(MultPsychMeds, id.vars = "ID")
#Group at the ID level and paste the drugs together with "/"
out <- ddply(dat.m, "ID", summarize, combos = paste(variable[value == 1], collapse = "/"))
#Calculate a table
with(out, count(combos))
x freq
1 HALOPERIDOL/QUETIAPINE 2
2 OLANZAPINE/HALOPERIDOL 1
3 OLANZAPINE/QUETIAPINE 3
4 OLANZAPINE/RISPERIDONE 4
Just for fun, a base R solution (that can be turned into a oneliner :-) ):
data.frame(table(apply(MultPsychMeds[,-1], 1, function(currow){
wc<-which(currow==1)
paste(colnames(MultPsychMeds)[wc+1], collapse="/")
})))
Another way could be:
subset(
as.data.frame(
with(MultPsychMeds, table(OLANZAPINE, HALOPERIDOL, QUETIAPINE, RISPERIDONE)),
responseName="count"
),
count>0
)
which gives
OLANZAPINE HALOPERIDOL QUETIAPINE RISPERIDONE count
4 1 1 0 0 1
6 1 0 1 0 3
7 0 1 1 0 2
10 1 0 0 1 4
It's not an exact way you want it, but is fast and simple.
There is shorthand in plyr package:
require(plyr)
count(MultPsychMeds, c("OLANZAPINE", "HALOPERIDOL", "QUETIAPINE", "RISPERIDONE"))
# OLANZAPINE HALOPERIDOL QUETIAPINE RISPERIDONE freq
# 1 0 1 1 0 2
# 2 1 0 0 1 4
# 3 1 0 1 0 3
# 4 1 1 0 0 1

Resources