I have trouble figuring out how I can compute a simple mean with dplyr on Long Format data.
My data look like this :
hldid idno sex diary age
1 1294 1294_1 2 1 39
2 1294 1294_1 2 2 39
3 1294 1294_2 1 1 43
4 1294 1294_2 1 2 43
...
With 5 variables: hldid, idno, sex, diary and age.
idno is the personal identifier but not the unique key.
Each individual appears twice, once for each diary filled in.
What I would like is to simply compute the age mean by sex.
Could you help me out ?
I tried something like :
dta %>%
group_by(sex) %>%
mutate( ng = n_distinct(idno)) %>%
group_by(age, add=TRUE) %>%
summarise(mean = n()/ng[1] )
But it does not work.
The data :
dta = structure(list(hldid = c(1294, 1294, 1294, 1294, 1352, 1352,
1352, 1352, 3741, 3741, 3741, 3741, 3809, 3809, 3809, 3809, 4037,
4037, 4037, 4037), idno = c("1294_1", "1294_1", "1294_2", "1294_2",
"1352_1", "1352_1", "1352_2", "1352_2", "3741_1", "3741_1", "3741_2",
"3741_2", "3809_1", "3809_1", "3809_2", "3809_2", "4037_1", "4037_1",
"4037_2", "4037_2"), sex = c(2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L), diary = c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L), age = c(39L, 39L, 43L, 43L, 31L, 31L, 37L, 37L,
33L, 33L, 37L, 37L, 34L, 34L, 37L, 37L, 41L, 41L, 32L, 32L)), .Names = c("hldid",
"idno", "sex", "diary", "age"), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -20L), vars = list(hldid), drop = TRUE, indices = list(
0:3, 4:7, 8:11, 12:15, 16:19), group_sizes = c(4L, 4L, 4L,
4L, 4L), biggest_group_size = 4L, labels = structure(list(hldid = c(1294,
1352, 3741, 3809, 4037)), class = "data.frame", row.names = c(NA,
-5L), .Names = "hldid", vars = list(hldid)))
quick update
Maybe this does not apply for this example,
but this kind of issues I have in mind is the following :
Imagine we have data like this :
3 women and 2 men, and a dummy act variable.
If we compute the mean without taking the long format into account, we will have problems:
aggregate(act ~ sex, FUN = mean, data = dtaTime)
What we should do is this :
aggregate(act ~ sex, FUN = sum, data = dtaTime)
6 / 2 # men
10 / 3 # women
Data
dtaTime = structure(list(id = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L),
sex = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), act = c(1L,
1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L)), .Names = c("id", "sex",
"act"), class = "data.frame", row.names = c(NA, -25L))
You are making it too complicated,
# Mean age by sex, counting every individual exactly once.
# The data are in long format (one row per diary), so averaging the raw rows
# would weight a person by the number of diaries they filled in. Drop the
# existing grouping, keep one row per idno, then average within sex.
dta %>%
  ungroup() %>%
  distinct(idno, .keep_all = TRUE) %>%
  group_by(sex) %>%
  summarise(meanage = mean(age))
should give you the mean age by sex.
A base R alternative:
# Formula interface: apply mean() to age within each value of sex.
aggregate(age ~ sex, dta, mean)
A data.table alternative:
library(data.table)
# setDT() converts dta to a data.table by reference (dta itself is changed);
# the j-expression then returns the mean age for each value of sex.
setDT(dta)[, .(meanage = mean(age)), by = sex]
Related
I have a data frame that looks like this:
I need to make a new data frame that takes the values for density from the expr_phenotype column and puts them in the formula: ((density for 4 + density for 68)/(density for 0+4+64+68)*100). How should I go about this?
I know it's easier to make a reproducible example but this is a funky dataset.
We can use dplyr::group_by, then use summarise:
library(dplyr)
# Per core (core_x, core_y): share of the density found in phenotypes 4 and 68
# relative to the density in phenotypes 0, 4, 64 and 68, as a percentage.
data %>%
  group_by(core_x, core_y) %>%
  summarise(
    result = {
      selected <- sum(density[is.element(expr_phenotype, c(4, 68))])
      overall <- sum(density[is.element(expr_phenotype, c(0, 4, 64, 68))])
      selected / overall * 100
    }
  )
## A tibble: 3 x 3
## Groups: core_x [1]
# core_x core_y result
# <int> <int> <dbl>
#1 1 1 39.8
#2 1 2 39.1
#3 1 3 51.8
Data (Obtained by OCR, please forgive errors)
data <- structure(list(core_x = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), core_y = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L), core = structure(c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L), .Label = c("1--1", "1--2",
"1--3"), class = "factor"), xX = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "Cc", class = "factor"), phenotype = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "CD163", class = "factor"),
expr_phenotype = c(0L, 4L, 64L, 68L, 0L, 4L, 64L, 68L, 0L,
4L), count = c(510L, 334L, 1L, 4L, 186L, 116L, 1L, 3L, 196L,
210L), density = c(451L, 295L, 1L, 4L, 164L, 103L, 1L, 3L,
173L, 186L)), row.names = 3:12, class = "data.frame")
Can someone help me how to count from another dataframe?
df1(out)
structure(list(Item = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), class = "factor", .Label = "0S1576"), LC = structure(c(1L,
1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), class = "factor", .Label = c("MW92",
"OY01", "RM11")), Fiscal.Month = c("2019-M06", "2019-M07", "2019-M06",
"2019-M07", "2019-M08", "2019-M09", "2019-M06", "2019-M07", "2019-M08"
)), row.names = c(NA, -9L), class = "data.frame")
df2(tempdf)
structure(list(Item = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "0S1576", class = "factor"),
LC = structure(c(1L, 1L, 1L, 1L, 2L, 3L, 4L, 6L, 5L, 1L,
2L, 2L, 3L, 3L), .Label = c("MW92", "OY01", "RM11", "RS11",
"WK14", "WK15"), class = "factor"), Fiscal.Month = structure(c(1L,
2L, 3L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("2019-M06",
"2019-M07", "2019-M08", "2019-M09"), class = "factor"), fcst = c(22L,
21L, 20L, 19L, 12L, 10L, 10L, 12L, 10L, 12L, 10L, 10L, 10L,
10L)), row.names = c(NA, -14L), class = "data.frame")
I want to count the frequency of Item,LC,Fiscal.month of df1 from df2
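You can count with table: combine the key columns of each data frame into a single factor with interaction (this is needed because the match is on more than one column), then use factor to restrict the keys of df2 to the levels present in df1.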
# Build one combined key per row of df2, keep only the keys that occur in df1
# (other keys become NA and are not counted), and tabulate. Keys of df1 that
# never occur in df2 are reported with a count of 0.
table(factor(interaction(df2[c("Item","LC","Fiscal.Month")]), levels=interaction(df1)))
#0S1576.MW92.2019-M06 0S1576.MW92.2019-M07 0S1576.OY01.2019-M06
# 2 1 3
#0S1576.OY01.2019-M07 0S1576.OY01.2019-M08 0S1576.OY01.2019-M09
# 0 0 0
#0S1576.RM11.2019-M06 0S1576.RM11.2019-M07 0S1576.RM11.2019-M08
# 3 0 0
Or a speed improved version using match and tabulate:
# match() gives, for every row of df2, the position of the df1 row with the
# same combined key (NA if there is none); tabulate() counts how often each
# df1 position occurs. The outer parentheses print the assigned value.
(df1$freq <- tabulate(match(interaction(df2[c("Item","LC","Fiscal.Month")]), interaction(df1)), nrow(df1)))
#[1] 2 1 3 0 0 0 3 0 0
Or sometimes even faster using fastmatch:
library(fastmatch)
# Same counting as with match(), using fastmatch::fmatch() for the key lookup.
df1$freq <- tabulate(fmatch(interaction(df2[c("Item","LC","Fiscal.Month")]), interaction(df1)), nrow(df1))
I am trying to aggregate dataset in R with median.
d <- aggregate(c(d$user_reported_percent, d$machine_percent),
by = list(d$day), FUN=median, simplify = TRUE, drop = TRUE)
But R keeps complaining, and I am not sure if it even makes sense to aggregate with median.
Some of the errors R gives me:
Error in aggregate.data.frame(as.data.frame(x), ...) :
arguments must have same length
Then i tried to use mutate to at least find the median
d <- d %>% group_by(day) %>% mutate(median=median(user_reported_percent))
error was:
Error: invalid subscript type 'integer'
I would appreciate any help!
Thanks a lot!
P.S with mean everything works perfectly fine
my dataset looks like this:
structure(list(esmFollValue = c(36.00852, 8.688648, 0.6372048,
13.7394, 0.7599012, 16.43628, 7.569684, 0.4502016, 0.7630464,
0.781386, 0.5116056, 0.858756, 18.06108, 0.5473332, 14.62944,
14.62944, 14.07216, 0.5366868, 14.12892, 0.7354944), esmHappValue = c(100L,
80L, 80L, 80L, 60L, 80L, 60L, 60L, 80L, 60L, 100L, 60L, 80L,
60L, 60L, 60L, 60L, 60L, 80L, 60L), deviceId = structure(c(11L,
11L, 11L, 6L, 3L, 15L, 3L, 3L, 15L, 3L, 15L, 15L, 15L, 15L, 3L,
3L, 15L, 3L, 9L, 9L), .Label = c("1e6c1183-af64-4860-b2d6-533cab7afe6c",
"34209e3d-1a82-4f75-95c8-846be8a1be03", "7066f4af-82f3-4369-8f45-70d1ea3d22f2",
"7cf78328-60c5-4564-9dd0-309cb0b3d5ad", "95b11f22-91e8-46d0-88d9-4f197267aa29",
"a0c89d2a-d22d-41d0-a070-b9887d911953", "cde8cc10-7212-4a41-ae9b-bbeb51dbe8ed",
"d150bfa4-0b52-47a0-b450-1eb21aaada53", "d41db7bc-2b81-4111-9b32-a0aab55cb25a",
"d7e8e8c7-5190-4f0b-aa49-72e520bc9aad", "dd1218a2-4e67-4cbf-bf4d-9e288865aa63",
"f093abf9-22e1-47e6-ae5d-1238629d8542", "fae0dd29-2b89-4c1d-b5ad-7858abe122ac",
"feeb0ab0-7d13-4a5c-b0df-58dd85c7f607", "ff883e61-c9a9-4e6b-8b6b-cab3e5535879"
), class = "factor"), timestamp = c(1457272936.882, 1457337998.931,
1457424251.996, 1457429767.632, 1457597635.755, 1457683537.604,
1457861178.161, 1457964712.356, 1458029223.54, 1458046931.652,
1458051135.219, 1458115293.069, 1458133652.503, 1458202019.302,
1458203945.674, 1458203945.787, 1458306790.803, 1458308783.441,
1458460903.755, 1458480932.088), group = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("groupA", "groupB", "groupC", "groupD"), class = "factor"),
cameraFeed = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Non-visible camera feed",
"Visible camera feed"), class = "factor"), timegroup = structure(c(2L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L,
2L, 2L, 1L, 2L), .Label = c("Day", "Evening"), class = "factor"),
day = structure(c(4L, 2L, 6L, 6L, 5L, 1L, 4L, 2L, 6L, 6L,
6L, 7L, 7L, 5L, 5L, 5L, 1L, 1L, 4L, 4L), .Label = c("Friday",
"Monday", "Saturday", "Sunday", "Thursday", "Tuesday", "Wednesday"
), class = "factor"), user_reported_percent = c(83.3333333333333,
66.6666666666667, 66.6666666666667, 66.6666666666667, 50,
66.6666666666667, 50, 50, 66.6666666666667, 50, 83.3333333333333,
50, 66.6666666666667, 50, 50, 50, 50, 50, 66.6666666666667,
50), machine_percent = c(30.0071, 7.24054, 0.531004, 11.4495,
0.633251, 13.6969, 6.30807, 0.375168, 0.635872, 0.651155,
0.426338, 0.71563, 15.0509, 0.456111, 12.1912, 12.1912, 11.7268,
0.447239, 11.7741, 0.612912)), .Names = c("esmFollValue",
"esmHappValue", "deviceId", "timestamp", "group", "cameraFeed",
"timegroup", "day", "user_reported_percent", "machine_percent"
), row.names = c(NA, 20L), class = "data.frame")
and I would like to have one value of percent per each day.
with the help of #nicola I used this:
# Median of both percentage columns for each day. The two columns are passed
# as a data frame (not concatenated with c()), so every column has the same
# length as the grouping vector d$day.
aggregate(d[,c("user_reported_percent","machine_percent")],by = list(d$day), FUN=median)
and everything worked fine.
Thanks a lot!
I have 2 data frames for 2 stacks that gives information about potential emission. One data frame gives the time frame of what hours the system turn on and off for 4 seasons. Each season start on specific date. The 2nd file give me the details of the stack.
I am trying with some sample file to test how to do this and so far I have managed to create a function following stack overflow example that allow me to create a data frame with the dates that I would like and a column with seasons for each date. I am really struggling now with the programming concept to understand how do I combine the 3 data frames to create the output template that I am trying to set up.
To show you an example my sample input are:
Stack_info File:
example seasonal Profile that shows when the system is on or off:
and the output I am after should create data frames for each year in the following format (only the black font and the red text to just explain what the values are):
What I find most difficult is that the output file for each year has a unique first row; the second row is repeated for each pollutant, and from the third row onwards come the hourly data for all 8760 hours. This block then has to be repeated for the next pollutant.
So far I have managed to create a function that helps me to assign season to each day of the year. For example:
# Build a lookup table (lut) that maps a "Mon-dd" label to a season, then use
# it to label every date from 2010 to 2012.
# d() returns the row index in lut of a given "Mon-dd" label. It reads the
# global lut when called, so it can be defined before lut exists.
d = function(month_day) which(lut$month_day == month_day)
# One row per day of 2012: 366 offsets (0..365) of 24 hours from 1 January.
# NOTE(review): the timestamps are in the session time zone; adding multiples
# of 24 h assumes no daylight-saving shift moves a label onto a neighbouring
# day -- confirm for your time zone.
lut = data.frame(all_dates = as.POSIXct("2012-1-1") + ((0:365) * 3600 * 24),
season = NA)
# "%b" is the abbreviated month name, so the labels used below ("Jan-01", ...)
# presumably require an English locale -- verify.
lut = within(lut, { month_day = strftime(all_dates, "%b-%d") })
# Fill the season by row ranges between boundary dates (winter wraps the year).
lut[c(d("Jan-01"):d("Mar-15"), d("Nov-08"):d("Dec-31")), "season"] = "winter"
lut[c(d("Mar-16"):d("Apr-30")), "season"] = "spring"
lut[c(d("May-01"):d("Sep-27")), "season"] = "summer"
lut[c(d("Sep-28"):d("Nov-07")), "season"] = "autumn"
# Row names allow lookup by label: lut["Mar-16", "season"].
rownames(lut) = lut$month_day
# Daily dates for 2010-2012, each labelled with the season of its "Mon-dd".
dates = data.frame(dates =seq(as.Date('2010-01-01'),as.Date('2012-12-31'),by = 1))
dates = within(dates, {
season = lut[strftime(dates, "%b-%d"), "season"]
})
This gives me a dates data frame and my other 2 samples data frames are (as shown in the image):
structure(list(`Source no` = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), Source = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("Stack 1", "Stack 2"), class = "factor"),
Period = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), Day = structure(c(2L,
6L, 7L, 5L, 1L, 3L, 4L, 2L, 6L, 7L, 5L, 1L, 3L, 4L, 2L, 6L,
7L, 5L, 1L, 3L, 4L), .Label = c("Fri", "Mon", "Sat", "Sun",
"Thu", "Tue", "Wed"), class = "factor"), `Spring On` = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 15L,
15L, 15L, 15L, 15L, 15L, 15L), `Spring Off` = c(23L, 23L,
23L, 23L, 23L, 23L, 23L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 18L,
18L, 18L, 18L, 18L, 18L, 18L), `Summer On` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "off", class = "factor"), `Summer Off` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "off", class = "factor"), `Autumn On` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "off", class = "factor"), `Autumn Off` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "off", class = "factor"), `Winter On` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("0", "off"), class = "factor"),
`Winter Off` = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("23",
"off"), class = "factor")), .Names = c("Source no", "Source",
"Period", "Day", "Spring On", "Spring Off", "Summer On", "Summer Off",
"Autumn On", "Autumn Off", "Winter On", "Winter Off"), class = "data.frame", row.names = c(NA,
-21L)) -> profile
structure(list(SNAME = structure(1:2, .Label = c("Stack 1", "Stack 2"
), class = "factor"), ISVARY = c(1L, 4L), VELVOL = c(1L, 4L),
TEMPDENS = c(0L, 2L), `DUM 1` = c(999L, 999L), `DUM 2` = c(999L,
999L), NPOL = c(2L, 2L), `EXIT VEL` = c(26.2, 22.4), TEMP = c(341L,
328L), `STACK DIAM` = c(1.5, 2.5), W = c(0L, 15L), Nox = c(39,
33.3), Sox = c(15.5, 17.9)), .Names = c("SNAME", "ISVARY",
"VELVOL", "TEMPDENS", "DUM 1", "DUM 2", "NPOL", "EXIT VEL", "TEMP",
"STACK DIAM", "W", "Nox", "Sox"), class = "data.frame", row.names = c(NA,
-2L)) -> stack_info
If anyone could give me any guidance of how I can proceed with the programming part would be really useful as I am just not sure how I can approach this to create separate output files as data frame for year 2010, 2011 and 2012.
The way your data is organised isn't ideal for processing. You may want to have a look at Hadley Wickham's paper on tidy data.
According to your desired output you need a dataframe with the number of lines equal to the number of hours a specific machine (stack n) is switched on. Therefore I suggest you create a dataframe containing every hour of a given year:
# One row per hour of 2010 (8760 rows) with calendar fields and the season.
# The sequence runs to 23:00 on 31 December so the last day is complete, and
# timestamps are generated and formatted in UTC so that a daylight-saving
# change in the local time zone cannot drop or duplicate an hour.
d.out <- data.frame(
  dates = seq(
    from = as.POSIXct("2010-01-01 00:00", tz = "UTC"),
    to = as.POSIXct("2010-12-31 23:00", tz = "UTC"),
    by = 3600
  )
)
d.out$year <- as.numeric(format(d.out$dates, "%Y", tz = "UTC"))
d.out$month <- as.numeric(format(d.out$dates, "%m", tz = "UTC"))
d.out$day <- as.numeric(format(d.out$dates, "%d", tz = "UTC"))
d.out$hour <- as.numeric(format(d.out$dates, "%H", tz = "UTC"))
# Abbreviated weekday name; the text depends on the session locale.
d.out$weekday <- format(d.out$dates, "%a", tz = "UTC")
# Day of the year (1..365 for 2010).
d.out$doj <- as.numeric(format(d.out$dates, "%j", tz = "UTC"))
# Season by day of year, valid for a non-leap year:
# 75 = 16 Mar, 121 = 1 May, 271 = 28 Sep, 312 = 8 Nov; everything else is winter.
d.out$season <- "Winter"
d.out$season[d.out$doj >= 75 & d.out$doj < 121] <- "Spring"
d.out$season[d.out$doj >= 121 & d.out$doj < 271] <- "Summer"
d.out$season[d.out$doj >= 271 & d.out$doj < 312] <- "Autumn"
The goal is to join this dataframe with your profile dataframe. Before joining, the profile-df has to be rearranged:
library(dplyr)
library(tidyr)
# Reshape the wide on/off profile into one row per source, period, weekday and
# season, with the switch-on and switch-off hours in columns On and Off.
profile_new =
profile %>%
# Stack the "<Season> On" / "<Season> Off" columns into key (season) and
# value (hour) pairs, keeping the four identifying columns as they are.
gather(season, hour, -c(`Source no`, Source, Period, Day)) %>%
# Split a key such as "Spring On" into season = "Spring" and status = "On".
extract(season, c("season", "status"), "(\\w+?)\\s(\\w+)") %>%
# Drop entries marked "off" (no operation in that season).
filter(hour != "off") %>%
# Day becomes character and hour numeric so they can be joined and compared.
mutate(Day = as.character(Day), hour=as.numeric(hour)) %>%
# Put the On and Off hours side by side in separate columns.
spread(status, hour)
Now it's easy to join the three dataframes to put together all the information you need to create your output:
# Attach the on/off profile to every hour, decide per stack and hour whether
# the stack is running, and set its emissions accordingly.
d.out %>%
# Match each hour to the profile rows of its weekday and season. This is an
# inner join, so hours without a matching profile row are not kept.
inner_join(profile_new, by=c("weekday"="Day", "season"="season")) %>%
# One group per stack and hour; a group can hold several profile rows.
group_by(Source, dates, year, day, weekday, season, hour) %>%
# status is TRUE when the hour lies within at least one On..Off window
# (both limits inclusive).
summarise(status = any(hour >= On & hour <= Off)) %>%
# Add the stack parameters and emission rates from stack_info.
inner_join(stack_info, by=c("Source"="SNAME")) %>%
# Emissions are set to zero for hours in which the stack is off.
mutate(Nox = ifelse(status, Nox, 0),
Sox = ifelse(status, Sox, 0)) %>%
arrange(Source, year, dates, hour) %>%
select(Source, year, day, weekday, season, hour, `EXIT VEL`, TEMP, `STACK DIAM`, W, Nox, Sox)
Obviously it's not quite the format you posted. From here you could write your dataframe to a csv (stack by stack by using append = TRUE).
So, I have a data frame with several continuous variables and several dummy variables. The survey that this data frame comes from uses 6,7,8 and 9 to denote different types of non-response. So, I would like to replace 6,7,8 and 9 with NA whenever they show up in a dummy variable column but leave them be in the continuous variable column.
Is there a concise way to go about doing this?
Here's my data:
> dput(head(sfsuse[c(4:16)]))
structure(list(famsize = c(3L, 1L, 2L, 5L, 3L, 5L), famtype = c(2L,
1L, 2L, 3L, 2L, 3L), cc = c(1L, 1L, 1L, 1L, 1L, 1L), nocc = c(1L,
1L, 1L, 3L, 1L, 1L), pdloan = c(2L, 2L, 2L, 2L, 2L, 2L), help = c(2L,
2L, 2L, 2L, 2L, 2L), budget = c(1L, 1L, 1L, 1L, 2L, 2L), income = c(340000L,
20500L, 0L, 165000L, 95000L, -320000L), govtrans = c(7500L, 15500L,
22000L, 350L, 0L, 9250L), childexp = c(0L, 0L, 0L, 0L, 0L, 0L
), homeown = c(1L, 1L, 1L, 1L, 1L, 2L), bank = c(2000L, 80000L,
25000L, 20000L, 57500L, 120000L), vehval = c(33000L, 7500L, 5250L,
48000L, 8500L, 50000L)), .Names = c("famsize", "famtype", "cc",
"nocc", "pdloan", "help", "budget", "income", "govtrans", "childexp",
"homeown", "bank", "vehval"), row.names = c(NA, 6L), class = "data.frame")
I'm trying to substitute NA for 6, 7, 8 and 9 in columns 3:7 and column 11. I know how to do this one column at a time, using the column names:
df$name[df$name %in% 6:9]<-NA
but I would have to do this for each column by name, is there a concise way to do it by column index?
Thanks
This function should work
# Replace non-response codes with NA in the selected columns of a data frame.
#
# data  : a data frame (base data.frame or tibble).
# k     : the column(s) to clean, given as positions, names, negative
#         positions or a logical vector; one or several columns.
# codes : values to be treated as non-response (default 6:9).
#
# Returns `data` with every value of `codes` in the selected columns set to
# NA; all other columns are returned unchanged.
f <- function(data, k, codes = 6:9) {
  # Resolve any kind of column selector to plain column positions.
  positions <- stats::setNames(seq_along(data), names(data))
  cols <- unname(positions[k])
  if (anyNA(cols)) {
    stop("undefined columns selected", call. = FALSE)
  }
  for (col in cols) {
    # [[ ]] extracts the column as a vector for data.frames and tibbles alike.
    values <- data[[col]]
    values[values %in% codes] <- NA
    data[[col]] <- values
  }
  data
}
Now at the console:
> for (k in c(3:7,11)) { data <- f(data,k) }