Automate Data Frame Element Division - r

I have a dataframe, from which I want to obtain percent treated from the dataset // where % treated = Treated / Total visits
eg. % treated Acute Maxillary Sinusitis = 93470/93470 = 100%
dput(droplevels(head(magma)))
structure(list(DIAG_CODE_1 = structure(c(1L, 1L, 2L, 2L, 2L,
2L), .Label = c("4610 SINUSITIS MAXILLARY ACUT", "4619 SINUSITIS ACUTE UNSP"
), class = "factor"), GENDER = structure(c(1L, 1L, 1L, 1L, 1L,
1L), .Label = "FEMALE", class = "factor"), AGE = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "0-2", class = "factor"), Mention_DRGU = c(5460L,
5460L, 17790L, 17790L, 9400L, 9400L), treatment_status = structure(c(1L,
2L, 1L, 2L, 1L, 2L), .Label = c("Total visits", "Treated"), class = "factor"),
diag_class_1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "Acute sinusitis", class = "factor"),
year = c(2007L, 2007L, 2007L, 2007L, 2008L, 2008L)), .Names = c("DIAG_CODE_1",
"GENDER", "AGE", "Mention_DRGU", "treatment_status", "diag_class_1",
"year"), row.names = c(1285L, 1286L, 1407L, 1410L, 1408L, 1411L
), class = "data.frame")
However with 432 rows, it's possible I could calculate that all manually but that would be incredibly time consuming. Isn't that what computers are for :p. If you guys could help me find ways to automate tasks within R that would be greatly appreciated.
Is there a way that R could create a resulting dataframe that would tell me the DIAG_CODE_1, GENDER, AGE, % treated, and the year? I've created (in Excel) what I want the output to look like so you guys can see what I mean.
I will be doing this sort of calculation for other respiratory diseases, so I'm looking to learn now that way I can make my life easier in the long run.

You could use dplyr
# Compute percent treated per diagnosis/gender/age/year with dplyr + tidyr.
library(dplyr)
library(tidyr)
magma %>%
# Widen: each treatment_status level ("Total visits", "Treated") becomes its own
# column holding the matching Mention_DRGU value.
spread(treatment_status, Mention_DRGU) %>%
# Percent treated = Treated / Total visits, scaled to 0-100.
mutate(PercentageTreated=100*(Treated/`Total visits`)) %>%
# Drop the helper columns so only the identifiers and the percentage remain.
select(-diag_class_1, -`Total visits`, -Treated)
# DIAG_CODE_1 GENDER AGE year PercentageTreated
#1 4610 SINUSITIS MAXILLARY ACUT FEMALE 0-2 2007 100
#2 4619 SINUSITIS ACUTE UNSP FEMALE 0-2 2007 100
#3 4619 SINUSITIS ACUTE UNSP FEMALE 0-2 2008 100

Try this:
# Base-R version: reshape to one row per diagnosis/gender/age/year, with one
# Mention_DRGU column per treatment_status level.
id_cols <- c("DIAG_CODE_1", "GENDER", "AGE", "diag_class_1", "year")
magma2 <- reshape(magma, idvar = id_cols, timevar = "treatment_status",
                  v.names = "Mention_DRGU", direction = "wide")
# reshape() names the new columns "<v.names>.<level>" and appends them in order
# of first appearance in the data ("Total visits" comes first here). Rename by
# name, not by position, so the two counts can never be swapped.
names(magma2)[names(magma2) == "Mention_DRGU.Total visits"] <- "TotVisits"
names(magma2)[names(magma2) == "Mention_DRGU.Treated"] <- "Treated"
# Percent treated = Treated / Total visits, scaled to 0-100.
magma2$PercentageTreated <- 100 * magma2$Treated / magma2$TotVisits
head(magma2)

Related

count number of times string appears in a column

Can you think about an intuitive way of calculating the number of times the word space appears in a certain column? Or any other solution that is viable.
I basically want to know how many times the space key was pressed, however some participants made the mistake and pressed other keys which would also be considered a mistake. So I was wondering if I should go with the "key_resp.rt" column instead and count the number of response times instead. If you had any idea of how to do both it would be great as I may need to use both.
I used the following code but the results do not conform to the data.
Data %>% group_by(Participant, Session) %>% summarise(false_start = sum(str_count(key_resp.keys, "space")))
Here is a snippet of my data:
Participant RT Session key_resp.keys key_resp.rt
X 0.431265 1 ["space"] [2.3173399999941466]
X 0.217685 1
X 0.317435 2 ["space","space"] [0.6671900000001187,2.032510000000002] 2020.1.3 4
Y 0.252515 1
Y 0.05127 2 ["space","space","space","space","space","space","space","space","space"] [4.917419999999765,6.151149999999689,6.333714999999771,6.638249999999971,6.833514999999338,7.0362499999992,7.217724999999504,7.38576999999988,7.66913999999997]
dput(droplevels(head(Data_PVT)))
structure(list(Interval_stimulus = c(4.157783411, 4.876139922,
5.67011868, 9.338167417, 9.196342656, 7.62448411), Participant = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "ADH80254", class = "factor"),
RT = c(431.265, 277.99, 253.515, 310.53, 299.165, 539.46),
Session = c(1L, 1L, 1L, 1L, 1L, 1L), date = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "2020-06-12_11h11.47.141", class = "factor"),
key_resp.keys = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"[\"space\"]"), class = "factor"), key_resp.rt = structure(c(2L,
1L, 1L, 1L, 1L, 1L), .Label = c("", "[2.3173399999941466]"
), class = "factor"), psychopyVersion = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = "2020.1.3", class = "factor"),
Trials = 0:5, Reciprocal = c(2.31875992719094, 3.59725169970143,
3.94453977082224, 3.22030077609249, 3.3426370063343, 1.85370555740926
)), row.names = c(NA, 6L), class = "data.frame")
Expected output:
Participant Session false_start
x 1 0
x 2 1
y 1 2
y 2 1
z 1 10
z 2 3
We can use str_count to count "space" values for each Participant and Session and sum them to get total. For all_false_start we count number of words in it.
library(dplyr)
library(stringr)
df %>%
group_by(Participant, Session) %>%
summarise(false_start = sum(str_count(key_resp.keys, '\\bspace\\b')),
all_false_start = sum(str_count(key_resp.keys, '\\b\\w+\\b')))

How to calculate mean base on quantile for one of the column

Let's say I want to find the mean of some columns, grouped by the quantiles of another column.
My table has several columns. I have the 10% quantiles (deciles) of the SalePrice column, and there are some other numeric columns in my table (there are also some factor variables in this table too).
I want to calculate the means of these variables, grouped by SalePrice quantile.
After that, I want to save the results in a data frame.
I want to use a loop to construct this data frame. I have a basic idea of the loop, but I don't know how to finish it, or how to add columns to the data frame inside the loop:
for (i in 1:lenth(tr)){
if(tr$i == numeric){
Result <- data.frame()
}
}
here is what I got for the SalePrice 10% quantile
> quantile(tr$SalePrice, c(seq(0, 1,0.1)),na.rm = TRUE, names = TRUE)
0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
34900 106450 124000 135500 147000 163000 179360 198740 230000 278000 755000
And my data look like this:
> dput(head(tr, 5))
structure(list(
MSSubClass = structure(c(6L, 1L, 6L, 7L, 6L), .Label = c("20", "30", "40", "45", "50", "60", "70", "75", "80", "85", "90", "120", "160", "180", "190"), class = "factor"),
MSZoning = structure(c(4L, 4L, 4L, 4L, 4L), .Label = c("C (all)", "FV", "RH", "RL", "RM"), class = "factor"),
LotFrontage = c(65, 80, 68, 60, 84),
LotArea = c(8450, 9600, 11250, 9550, 14260),
Street = structure(c(2L, 2L, 2L, 2L, 2L), .Label = c("Grvl", "Pave"), class = "factor"),
Alley = structure(c(2L, 2L, 2L, 2L, 2L), .Label = c("Grvl", "NA", "Pave"), class = "factor"),
LotShape = structure(c(4L, 4L, 1L, 1L, 1L), .Label = c("IR1", "IR2", "IR3", "Reg"), class = "factor"),
LandContour = structure(c(4L, 4L, 4L, 4L, 4L), .Label = c("Bnk", "HLS", "Low", "Lvl"), class = "factor"),
Utilities = structure(c(1L, 1L, 1L, 1L, 1L), .Label = c("AllPub", "NoSeWa"), class = "factor"),
LotConfig = structure(c(5L, 3L, 5L, 1L, 3L), .Label = c("Corner", "CulDSac", "FR2", "FR3", "Inside"), class = "factor"),
LandSlope = structure(c(1L, 1L, 1L, 1L, 1L), .Label = c("Gtl", "Mod", "Sev"), class = "factor"),
Neighborhood = structure(c(6L, 25L, 6L, 7L, 14L), .Label = c("Blmngtn", "Blueste", "BrDale", "BrkSide", "ClearCr", "CollgCr", "Crawfor", "Edwards", "Gilbert", "IDOTRR", "MeadowV", "Mitchel", "NAmes", "NoRidge", "NPkVill", "NridgHt", "NWAmes", "OldTown", "Sawyer", "SawyerW", "Somerst", "StoneBr", "SWISU", "Timber", "Veenker"), class = "factor"),
Condition1 = structure(c(3L, 2L, 3L, 3L, 3L), .Label = c("Artery", "Feedr", "Norm", "PosA", "PosN", "RRAe", "RRAn", "RRNe", "RRNn"), class = "factor"),
Condition2 = structure(c(3L, 3L, 3L, 3L, 3L), .Label = c("Artery", "Feedr", "Norm", "PosA", "PosN", "RRAe", "RRAn", "RRNn"), class = "factor"),
BldgType = structure(c(1L, 1L, 1L, 1L, 1L), .Label = c("1Fam", "2fmCon", "Duplex", "Twnhs","TwnhsE"), class = "factor"),
SalePrice = c(208500, 181500, 223500, 140000, 250000)), row.names = c(NA, 5L), class = "data.frame")
I only attach some variables here, not all of them.
You did not provide any data so I was left making a few assumptions. Assuming that your data is called df perhaps you can use dput(head(df, 100)) and copy and paste the output here?
If not does this work for you?
d1 <- runif(1000)
d2 <- runif(1000)
d3 <- runif(1000)
df <- data.frame(SalePrice = d1,
data2 = d2,
data3 = d3)
library(dplyr)
df %>%
mutate(Mydeciles = ntile(data2, 10)) %>%
group_by(Mydeciles) %>%
summarise(mean_sales_price = mean(SalePrice),
mean_data2 = mean(data2),
mean_data3 = mean(data3))
Output:
# A tibble: 10 x 4
Mydeciles mean_sales_price mean_data2 mean_data3
<int> <dbl> <dbl> <dbl>
1 1 0.497 0.0450 0.450
2 2 0.520 0.144 0.522
3 3 0.506 0.250 0.487
4 4 0.472 0.360 0.457
5 5 0.510 0.469 0.553
6 6 0.555 0.564 0.503
7 7 0.510 0.652 0.540
8 8 0.461 0.751 0.482
9 9 0.465 0.844 0.485
10 10 0.530 0.952 0.534
Solution 2:
# Split the rows into a lower and an upper half by SalePrice rank, then average
# every numeric column within each half.
df %>%
  mutate(Mydeciles = ntile(SalePrice, 2)) %>%
  group_by(Mydeciles) %>%
  # Pass the function directly: funs() has been deprecated since dplyr 0.8.0.
  summarise_if(is.numeric, mean)
Gives:
# A tibble: 2 x 4
Mydeciles LotFrontage LotArea SalePrice
<int> <dbl> <dbl> <dbl>
1 1 68.3 9200 176667.
2 2 76 12755 236750
A data.table answer:
library(data.table)
setDT(df)
# `by` needs one group label per row, but quantile() alone returns only the 11
# break points. Bin every row into its SalePrice decile with cut() first.
# include.lowest = TRUE keeps the minimum SalePrice in the first bin.
df[, .(mean_price = mean(SalePrice),
       mean_data2 = mean(data2),
       mean_data3 = mean(data3)),
   keyby = .(qtl = cut(SalePrice,
                       quantile(SalePrice, seq(0, 1, 0.1)),
                       include.lowest = TRUE))]

How can I find a subsequent trial based on a condition?

I am using R to manipulate a large dataset (dataset) that consists of 20,000+ rows. In my data, I have three important columns to focus on for this question: Trial_Nr (consisting of 90 trials), seconds (increasing in .02 second increments), and threat (fixation to threat: 1=yes, 0=no, NA). Within each trial, I need to answer: once the participant initially fixates on threat (1), how long does it take for them to stop fixating on threat (0)? So basically, within each trial, I would need to find the first threat=1 and the subsequent threat=0 and subtract the times. I am able to get the first threat with this code:
initalfixthreat <- dataset %>%
group_by(Trial_Nr) %>%
slice(which(threat == '1')[1])
I am stumped on how to get the subsequent threat=0 within that trial number.
Here is an example of the data (sorry don't know how to format it better):
So for Trial_Nr=1, I would be interested in 689.9 seconds- 689.8.
For Trial_Nr=2, I would want 690.04-689.96.
Please let me know if I was unclear and thank you all for your help!
One approach is:
# Per trial: time from the first threat fixation (threat == 1) to the first
# non-threat sample (threat == 0) that follows it.
library(dplyr)
df %>%
# Work within each trial so lag() and duplicated() never cross trial boundaries.
group_by(Trial_Nr) %>%
# Drop samples with no threat reading.
filter(!is.na(threat)) %>%
# flag is 1 on every threat row. On a non-threat row it is threat - previous
# threat: -1 right after a threat row, 0 after another non-threat row, and NA
# when it is the trial's first remaining row.
mutate(flag = ifelse(threat == 1, 1, threat - lag(threat))) %>%
# Keep only the first row flagged 1 and the first row flagged -1.
filter(abs(flag) == 1 & !duplicated(flag)) %>%
# Two rows left -> elapsed time between them; a single row (threat never
# followed by a non-threat sample) -> NA.
summarise(timediff = ifelse(length(seconds) == 1, NA, diff(seconds)))
# A tibble: 2 x 2
Trial_Nr timediff
<int> <dbl>
1 1 0.1
2 2 0.0800
Data:
df <- structure(list(Trial_Nr = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L), seconds = c(689.76, 689.78, 689.8, 689.82,
689.84, 689.86, 689.88, 689.9, 689.92, 689.94, 689.96, 689.98,
690, 690.02, 690.04), threat = c(0L, 0L, 1L, 1L, 1L, NA, NA,
0L, 1L, 0L, 1L, NA, NA, 1L, 0L)), class = "data.frame", row.names = c(NA,
-15L))

Calculate Difference between dates by group in R

I'm using a logistic exposure to calculate hatching success for bird nests. My data set is quite extensive and I have ~2,000 nests, each with a unique ID ("ClutchID"). I need to calculate the number of days a given nest was exposed ("Exposure"), or more simply, the difference between the 1st and last day. I used the following code:
HS_Hatch$Exposure=NA
for(i in 2:nrow(HS_Hatch)){HS_Hatch$Exposure[i]=HS_Hatch$DateVisit[i]- HS_Hatch$DateVisit[i-1]}
where HS_Hatch is my dataset and DateVisit is the actual date. The only problem is R is calculating an exposure value for the 1st date (which doesn't make sense).
What I really need is to calculate the difference between the 1st and last date for a given clutch. I've also looked into the following:
Exposure=ddply(HS_Hatch, "ClutchID", summarize,
orderfrequency = as.numeric(diff.Date(DateVisit)))
df %>%
mutate(Exposure = as.Date(HS_Hatch$DateVisit, "%Y-%m-%d")) %>%
group_by(ClutchID) %>%
arrange(Exposure) %>%
mutate(lag=lag(DateVisit), difference=DateVisit-lag)
I'm still learning R so any help would be greatly appreciated.
Edit:
Below is a sample of the data I'm using
HS_Hatch <- structure(list(ClutchID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L
), DateVisit = c("3/15/2012", "3/18/2012", "3/20/2012", "4/1/2012",
"4/3/2012", "3/18/2012", "3/20/2012", "3/22/2012", "4/3/2012",
"4/4/2012", "3/22/2012", "4/3/2012", "4/4/2012", "3/18/2012",
"3/20/2012", "3/22/2012", "4/2/2012", "4/3/2012", "4/4/2012",
"3/20/2012", "3/22/2012", "3/25/2012", "3/27/2012", "4/4/2012",
"4/5/2012"), Year = c(2012L, 2012L, 2012L, 2012L, 2012L, 2012L,
2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L,
2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L,
2012L), Survive = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -25L), .Names = c("ClutchID",
"DateVisit", "Year", "Survive"), spec = structure(list(cols = structure(list(
ClutchID = structure(list(), class = c("collector_integer",
"collector")), DateVisit = structure(list(), class = c("collector_character",
"collector")), Year = structure(list(), class = c("collector_integer",
"collector")), Survive = structure(list(), class = c("collector_integer",
"collector"))), .Names = c("ClutchID", "DateVisit", "Year",
"Survive")), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"))
Collecting some of the comments...
Load dplyr
We need only the dplyr package for this problem. If we load other packages, e.g. plyr, it can cause conflicts if both packages have functions with the same name. Let's load only dplyr.
library(dplyr)
In the future, you may wish to load tidyverse instead -- it includes dplyr and other related packages, for graphics, etc.
Converting dates
Let's convert the DateVisit variable from character strings to something R can interpret as a date. Once we do this, it allows R to calculate differences in days by subtracting two dates from each other.
HS_Hatch <- HS_Hatch %>%
mutate(date_visit = as.Date(DateVisit, "%m/%d/%Y"))
The date format %m/%d/%Y is different from your original code. This date format needs to match how dates look in your data. DateVisit has dates as month/day/year, so we use %m/%d/%Y.
Also, you don't need to specify the dataset for DateVisit inside mutate, as in HS_Hatch$DateVisit, because it's already looking in HS_Hatch. The code HS_Hatch %>% ... says 'use HS_Hatch for the following steps'.
Calculating exposures
To calculate exposure, we need to find the first date, last date, and then the difference between the two, for each set of rows by ClutchID. We use summarize, which collapses the data to one row per ClutchID.
exposure <- HS_Hatch %>%
group_by(ClutchID) %>%
summarize(first_visit = min(date_visit),
last_visit = max(date_visit),
exposure = last_visit - first_visit)
first_visit = min(date_visit) will find the minimum date_visit for each ClutchID separately, since we are using group_by(ClutchID).
exposure = last_visit - first_visit takes the newly-calculated first_visit and last_visit and finds the difference in days.
This creates the following result:
ClutchID first_visit last_visit exposure
<int> <date> <date> <dbl>
1 1 2012-03-15 2012-04-03 19
2 2 2012-03-18 2012-04-04 17
3 3 2012-03-22 2012-04-04 13
4 4 2012-03-18 2012-04-04 17
5 5 2012-03-20 2012-04-05 16
If you want to keep all the original rows, you can use mutate in place of summarize.
Here is a similar solution if you want the difference in days between consecutive dates in a date vector, without NA values in the new column (the first row of each group gets 0), and if you need to group by several variables.
Make sure your date vector has been converted to the proper Date format, as explained previously.
dat2 <- dat %>%
select(group1, group2, date) %>%
arrange(group1, group2, date) %>%
group_by(group1, group2) %>%
mutate(diff_date = c(0,diff(date)))

R program, ?count, rename "freq" to something else

I am studying this webpage, and cannot figure out how to rename freq to something else, say number of times imbibed
Here is dput
structure(list(name = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L), .Label = c("Bill", "Llib"), class = "factor"), drink = structure(c(2L,
3L, 1L, 4L, 2L, 3L, 1L, 4L), .Label = c("cocoa", "coffee", "tea",
"water"), class = "factor"), cost = 1:8), .Names = c("name",
"drink", "cost"), row.names = c(NA, -8L), class = "data.frame")
And this is working code with output. Again, I'd like to rename the freq column. Thanks!
library(plyr)
bevs$cost <- as.integer(bevs$cost)
count(bevs, "name")
Output
name freq
1 Bill 4
2 Llib 4
Are you trying to do this?
counts <- count(bevs, "name")
names(counts) <- c("name", "number of times imbibed")
counts
The count() function returns a data.frame. Just rename it like any other data.frame:
counts <- count(bevs, "name")
names(counts)[which(names(counts) == "freq")] <- "number of times imbibed"
print(counts)
# name number of times imbibed
# 1 Bill 4
# 2 Llib 4

Resources