Related
I have a dataframe that looks something like this, in which I have several rows for each user, and many NAs in the columns.
user
Effect T1
Effect T2
Effect T3
Benchmark T1
Benchmark T2
Benchmark T3
Tom
01
NA
NA
02
NA
NA
Tom
NA
07
NA
NA
08
NA
Tom
NA
NA
13
NA
NA
14
Larry
03
NA
NA
04
NA
NA
Larry
NA
09
NA
NA
10
NA
Larry
NA
NA
15
NA
NA
16
Dave
05
NA
NA
06
NA
NA
Dave
NA
11
NA
NA
12
NA
Dave
NA
NA
17
NA
NA
18
I want to collapse the columns using the name, filling in the values from each row, like this.
user
Effect T1
Effect T2
Effect T3
Benchmark T1
Benchmark T2
Benchmark T3
Tom
01
07
13
02
08
14
Larry
03
09
15
04
10
16
Dave
05
11
17
06
12
18
How might I accomplish this?
Thank you in advance for your help. Update: I've added the dput of a subset of the actual data below.
structure(list(name = c("Abraham_Ralph", "Abraham_Ralph", "Abraham_Ralph",
"Ackerman_Gary", "Adams_Alma", "Adams_Alma", "Adams_Alma", "Adams_Alma",
"Adams_Sandy", "Aderholt_Robert", "Aderholt_Robert", "Aderholt_Robert",
"Aderholt_Robert", "Aderholt_Robert", "Aguilar_Pete", "Aguilar_Pete",
"Aguilar_Pete"), state = c("LA", "LA", "LA", "NY", "NC", "NC",
"NC", "NC", "FL", "AL", "AL", "AL", "AL", "AL", "CA", "CA", "CA"
), seniority = c(1, 2, 3, 15, 1, 2, 3, 4, 1, 8, 9, 10, 11, 12,
1, 2, 3), legeffect_112 = c(NA, NA, NA, 0.202061712741852, NA,
NA, NA, NA, 1.30758035182953, 3.73544979095459, NA, NA, NA, NA,
NA, NA, NA), legeffect_113 = c(NA, NA, NA, NA, 0, NA, NA, NA,
NA, NA, 0.908495426177979, NA, NA, NA, NA, NA, NA), legeffect_114 = c(2.07501077651978,
NA, NA, NA, NA, 0.84164834022522, NA, NA, NA, NA, NA, 0.340001106262207,
NA, NA, 0.10985741019249, NA, NA), legeffect_115 = c(NA, 0.493490308523178,
NA, NA, NA, NA, 0.587624311447144, NA, NA, NA, NA, NA, 0.159877583384514,
NA, NA, 0.730929613113403, NA), legeffect_116 = c(NA, NA, 0.0397605448961258,
NA, NA, NA, NA, 1.78378939628601, NA, NA, NA, NA, NA, 0.0198802724480629,
NA, NA, 0.0497006773948669), benchmark_112 = c(NA, NA, NA, 0.738679468631744,
NA, NA, NA, NA, 0.82908970117569, 1.39835929870605, NA, NA, NA,
NA, NA, NA, NA), benchmark_113 = c(NA, NA, NA, NA, 0.391001850366592,
NA, NA, NA, NA, NA, 1.58223271369934, NA, NA, NA, NA, NA, NA),
benchmark_114 = c(1.40446054935455, NA, NA, NA, NA, 0.576326191425323,
NA, NA, NA, NA, NA, 1.42212760448456, NA, NA, 0.574363172054291,
NA, NA), benchmark_115 = c(NA, 1.3291300535202, NA, NA, NA,
NA, 0.537361204624176, NA, NA, NA, NA, NA, 1.45703768730164,
NA, NA, 0.523149251937866, NA), benchmark_116 = c(NA, NA,
0.483340591192245, NA, NA, NA, NA, 1.31058621406555, NA,
NA, NA, NA, NA, 0.751261711120605, NA, NA, 1.05683290958405
)), row.names = c(NA, -17L), class = c("tbl_df", "tbl", "data.frame"
))
A data.table solution:
# melt data, remove NA, then recast ...
# Melt to long form keyed by "name", keep only the non-NA values
# (`!value %in% NA` is equivalent to `!is.na(value)`, since %in% matches
# NA against NA), then cast back to wide so each name gets one row.
dt <- dcast(melt(data.table(d), "name")[!value %in% NA], name ~ variable)
# Same idea, but also excluding the non-measure columns ("state",
# "seniority") so only the legeffect_*/benchmark_* variables are cast.
dcast(melt(data.table(d), "name")[!value %in% c(NA) & !variable %in% c("variable", "seniority", "state")], name ~ variable)
name legeffect_112 legeffect_113 legeffect_114 legeffect_115 legeffect_116 benchmark_112 benchmark_113 benchmark_114 benchmark_115 benchmark_116
1: Abraham_Ralph <NA> <NA> 2.07501077651978 0.493490308523178 0.0397605448961258 <NA> <NA> 1.40446054935455 1.3291300535202 0.483340591192245
2: Ackerman_Gary 0.202061712741852 <NA> <NA> <NA> <NA> 0.738679468631744 <NA> <NA> <NA> <NA>
3: Adams_Alma <NA> 0 0.84164834022522 0.587624311447144 1.78378939628601 <NA> 0.391001850366592 0.576326191425323 0.537361204624176 1.31058621406555
4: Adams_Sandy 1.30758035182953 <NA> <NA> <NA> <NA> 0.82908970117569 <NA> <NA> <NA> <NA>
5: Aderholt_Robert 3.73544979095459 0.908495426177979 0.340001106262207 0.159877583384514 0.0198802724480629 1.39835929870605 1.58223271369934 1.42212760448456 1.45703768730164 0.751261711120605
6: Aguilar_Pete <NA> <NA> 0.10985741019249 0.730929613113403 0.0497006773948669 <NA> <NA> 0.574363172054291 0.523149251937866 1.05683290958405
Data/Setup
# Load data.table
# install.packages("data.table")
library(data.table)
# Read example data
d <- structure(list(name = c("Abraham_Ralph", "Abraham_Ralph", "Abraham_Ralph",
"Ackerman_Gary", "Adams_Alma", "Adams_Alma", "Adams_Alma", "Adams_Alma",
"Adams_Sandy", "Aderholt_Robert", "Aderholt_Robert", "Aderholt_Robert",
"Aderholt_Robert", "Aderholt_Robert", "Aguilar_Pete", "Aguilar_Pete",
"Aguilar_Pete"), state = c("LA", "LA", "LA", "NY", "NC", "NC",
"NC", "NC", "FL", "AL", "AL", "AL", "AL", "AL", "CA", "CA", "CA"
), seniority = c(1, 2, 3, 15, 1, 2, 3, 4, 1, 8, 9, 10, 11, 12,
1, 2, 3), legeffect_112 = c(NA, NA, NA, 0.202061712741852, NA,
NA, NA, NA, 1.30758035182953, 3.73544979095459, NA, NA, NA, NA,
NA, NA, NA), legeffect_113 = c(NA, NA, NA, NA, 0, NA, NA, NA,
NA, NA, 0.908495426177979, NA, NA, NA, NA, NA, NA), legeffect_114 = c(2.07501077651978,
NA, NA, NA, NA, 0.84164834022522, NA, NA, NA, NA, NA, 0.340001106262207,
NA, NA, 0.10985741019249, NA, NA), legeffect_115 = c(NA, 0.493490308523178,
NA, NA, NA, NA, 0.587624311447144, NA, NA, NA, NA, NA, 0.159877583384514,
NA, NA, 0.730929613113403, NA), legeffect_116 = c(NA, NA, 0.0397605448961258,
NA, NA, NA, NA, 1.78378939628601, NA, NA, NA, NA, NA, 0.0198802724480629,
NA, NA, 0.0497006773948669), benchmark_112 = c(NA, NA, NA, 0.738679468631744,
NA, NA, NA, NA, 0.82908970117569, 1.39835929870605, NA, NA, NA,
NA, NA, NA, NA), benchmark_113 = c(NA, NA, NA, NA, 0.391001850366592,
NA, NA, NA, NA, NA, 1.58223271369934, NA, NA, NA, NA, NA, NA),
benchmark_114 = c(1.40446054935455, NA, NA, NA, NA, 0.576326191425323,
NA, NA, NA, NA, NA, 1.42212760448456, NA, NA, 0.574363172054291,
NA, NA), benchmark_115 = c(NA, 1.3291300535202, NA, NA, NA,
NA, 0.537361204624176, NA, NA, NA, NA, NA, 1.45703768730164,
NA, NA, 0.523149251937866, NA), benchmark_116 = c(NA, NA,
0.483340591192245, NA, NA, NA, NA, 1.31058621406555, NA,
NA, NA, NA, NA, 0.751261711120605, NA, NA, 1.05683290958405
)), row.names = c(NA, -17L), class = c("tbl_df", "tbl", "data.frame"
))
This solution is using only the base functions (no extra packages), but the one-liner may cause eyes to cross, so I'll split it into several functions.
The plan is the following:
Split the original data.frame by the values in name column, using the function by;
For each partition of the data.frame, collapse the columns;
A collapsed column returns the max value of the column, or NA if all its values are NA;
The collapsed data.frame partitions are stacked together.
So, this is a function that does that:
dfr_collapse <- function(dfr, col0)
{
  # Collapse the rows of the data.frame "dfr" grouped by the values of
  # the column named "col0": within each group, every column is reduced
  # to a single value (its maximum, ignoring NAs).
  #
  # Args:
  #   dfr:  the data.frame to collapse.
  #   col0: name (string) of the grouping column.
  #
  # Returns:
  #   The stacked (rbind-ed) collapsed groups; row names are the group
  #   values of col0.

  # Max/NA function: maximum of the non-NA values, or a typed NA when
  # the whole vector is NA. Using x[1] (which is NA whenever all values
  # are NA) keeps the NA in the column's own type, fixing the
  # logical-NA issue of the naive `NA` return.
  namax <- function(x)
  {
    if (all(is.na(x)))
      x[1]
    else
      max(x, na.rm = TRUE)
  }
  # Column collapse function: reduce every column of one group
  byfun <- function(x)
  {
    lapply(x, namax)
  }
  # Stack the per-group results back together
  return(do.call(
    what = rbind,
    args = by(dfr, dfr[[col0]], byfun)
  ))
}
May not look as slick as a one-liner, but it does the job. It can be turned into a one-liner, but you don't want that.
Assuming that df0 is the data.frame from your dput, you can test this function with
# Pass the grouping column explicitly -- dfr_collapse() has no default for
# its second argument; "name" is the grouping column in the dput data
dfr_collapse(df0, "name")
Nota bene: for the sake of simplicity, I return an NA of type logical (see the comment # !!! above). The correct code should convert that NA to the mode of the x vector. Also, the function should check the type of its inputs, etc.
I have this data below:
## ID DOB sector meters Oct Res_FROM Res_TO Exp_FROM
## 1 20100 1979-08-24 H38 6400 W 1979-08-15 1991-05-15 1979-08-24
## 2 20101 1980-05-05 B01 1600 NW 1980-05-15 1991-04-15 1980-05-15
## 3 20102 1979-03-17 H04 1600 SW 1972-06-15 1979-08-15 1979-03-17
## 4 20103 1981-11-30 B09 3200 NE 1982-01-15 1984-01-15 1982-01-15
## 5 20103 1981-11-30 B37 8000 N 1984-01-15 1986-04-15 1984-01-15
## 6 20104 1978-09-01 B09 3200 NE 1982-01-15 1984-01-15 1982-01-15
## Exp_TO Exps_Grp Yr1952 Yr1953 Yr1954 Yr1955 Yr1956 Yr1957 Yr1958 Yr1959
## 1 1988-12-31 fr51>88 NA NA NA NA NA NA NA NA
## 2 1988-12-31 fr51>88 NA NA NA NA NA NA NA NA
## 3 1979-08-15 between NA NA NA NA NA NA NA NA
## 4 1984-01-15 between NA NA NA NA NA NA NA NA
## 5 1986-04-15 between NA NA NA NA NA NA NA NA
## 6 1984-01-15 between NA NA NA NA NA NA NA NA
## Yr1960 Yr1961 Yr1962 Yr1963 Yr1964 Yr1965 Yr1966 Yr1967 Yr1968 Yr1969 Yr1970
## 1 NA NA NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA NA NA
## Yr1971 Yr1972 Yr1973 Yr1974 Yr1975 Yr1976 Yr1977 Yr1978 Yr1979 Yr1980
## 1 NA NA NA NA NA NA NA NA 5.950991 4.340588
## 2 NA NA NA NA NA NA NA NA NA 2.927725
## 3 NA NA NA NA NA NA NA NA 20.608986 NA
## 4 NA NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA NA
## Yr1981 Yr1982 Yr1983 Yr1984 Yr1985 Yr1986 Yr1987 Yr1988
## 1 4.340588 4.340588 4.340588 4.3405881 4.340588 4.3405881 4.340588 1.083782
## 2 4.447229 4.447229 4.447229 4.4472289 4.447229 4.4472289 4.447229 1.110409
## 3 NA NA NA NA NA NA NA NA
## 4 NA 15.365412 16.018407 0.6529943 NA NA NA NA
## 5 NA NA NA 2.9414202 3.052618 0.6918076 NA NA
## 6 NA 15.365412 16.018407 0.6529943 NA NA NA NA
## Yrs_Exp arth_mean median cumulative caldate Age Month_Res
## 1 9.3616438 4.175948 4.340588 41.759478 12/31/88 9 141
## 2 8.6356164 3.907637 4.447229 35.168736 12/31/88 9 131
## 3 0.4136986 20.608986 20.608986 20.608986 12/31/88 9 86
## 4 2.0000000 10.678938 15.365412 32.036813 12/31/88 9 24
## 5 2.2493151 2.228615 2.941420 6.685846 12/31/88 8 27
## 6 2.0000000 10.678938 15.365412 32.036813 12/31/88 9 24
I have talked with a couple of other folks and they have recommended that I normalize the cumulative exposures for each person (ID) based on population for each sector and the average residence time per sector. I have a couple of questions. First, how would I go about creating the R code to determine how many people (IDs) are in each sector, and then how would I calculate average residence time per sector (the month_res column gives how many months residents lived in that sector)? I've tried this R code to separate each sector by the total number of IDs in it, but the error 'sum' not relevant for factors was given.
# Convert the IDs to a factor (they are labels, not quantities)
Fernald_Normalized$ID <- as.factor(Fernald_Normalized$ID)
# NOTE(review): sum() is not defined for factors, which is exactly what
# triggers the reported "'sum' not relevant for factors" error. To count
# the IDs in each sector, use FUN = length instead of FUN = sum.
Fernald_1 <- aggregate(Fernald_Normalized$ID, list(Fernald_Normalized$sector), FUN=sum)
If I keep ID as numeric, it sums the IDs themselves in each sector and produces a large number. Additionally, once I calculate the number of IDs per sector and the average residence time per sector, how would I use R to actually normalize this? I have a basic understanding of why we would normalize and generally what is done for normalization, but I haven't been able to create code for this in R. Reproducible dataset below. This is only a small snippet, in reality there are around 14,000 rows.
dat <- structure(list(UC_ID = c(20100L, 20101L, 20102L, 20103L, 20103L,
20104L, 20104L, 20105L, 20105L, 20106L, 20106L), DOB = c("1979-08-24",
"1980-05-05", "1979-03-17", "1981-11-30", "1981-11-30", "1978-09-01",
"1978-09-01", "1980-12-03", "1980-12-03", "1978-04-25", "1978-04-25"
), sector = c("H38", "B01", "H04", "B09", "B37", "B09", "B37",
"B09", "B09", "B09", "B09"), meters = c(6400L, 1600L, 1600L,
3200L, 8000L, 3200L, 8000L, 3200L, 3200L, 3200L, 3200L), Oct = c("W",
"NW", "SW", "NE", "N", "NE", "N", "NE", "NE", "NE", "NE"), Res_FROM = c("1979-08-15",
"1980-05-15", "1972-06-15", "1982-01-15", "1984-01-15", "1982-01-15",
"1984-01-15", "1980-12-15", "1983-08-15", "1978-04-15", "1983-08-15"
), Res_TO = c("1991-05-15", "1991-04-15", "1979-08-15", "1984-01-15",
"1986-04-15", "1984-01-15", "1986-04-15", "1983-08-15", "1991-03-15",
"1983-08-15", "2000-01-15"), Exp_FROM = c("1979-08-24", "1980-05-15",
"1979-03-17", "1982-01-15", "1984-01-15", "1982-01-15", "1984-01-15",
"1980-12-15", "1983-08-15", "1978-04-25", "1983-08-15"), Exp_TO = c("1988-12-31",
"1988-12-31", "1979-08-15", "1984-01-15", "1986-04-15", "1984-01-15",
"1986-04-15", "1983-08-15", "1988-12-31", "1983-08-15", "1988-12-31"
), Exps_Grp = c("fr51>88", "fr51>88", "between", "between", "between",
"between", "between", "between", "fr51>88", "between", "fr51>88"
), Yr1952 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1953 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1954 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1955 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), Yr1956 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), Yr1957 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), Yr1958 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), Yr1959 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1960 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1961 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1962 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), Yr1963 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), Yr1964 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), Yr1965 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), Yr1966 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1967 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1968 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1969 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), Yr1970 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), Yr1971 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), Yr1972 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), Yr1973 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1974 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1975 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1976 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), Yr1977 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), Yr1978 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
79.39642441, NA), Yr1979 = c(5.950991161, NA, 20.60898553, NA,
NA, NA, NA, NA, NA, 59.94484924, NA), Yr1980 = c(4.340588078,
2.927724588, NA, NA, NA, NA, NA, 0.758267013, NA, 16.01840668,
NA), Yr1981 = c(4.340588078, 4.447228937, NA, NA, NA, NA, NA,
16.01840668, NA, 16.01840668, NA), Yr1982 = c(4.340588078, 4.447228937,
NA, 15.36541238, NA, 15.36541238, NA, 16.01840668, NA, 16.01840668,
NA), Yr1983 = c(4.340588078, 4.447228937, NA, 16.01840668, NA,
16.01840668, NA, 9.952203009, 6.066203667, 9.952203009, 6.066203667
), Yr1984 = c(4.340588078, 4.447228937, NA, 0.652994292, 2.941420153,
0.652994292, 2.941420153, NA, 16.01840668, NA, 16.01840668),
Yr1985 = c(4.340588078, 4.447228937, NA, NA, 3.052618478,
NA, 3.052618478, NA, 16.01840668, NA, 16.01840668), Yr1986 = c(4.340588078,
4.447228937, NA, NA, 0.691807598, NA, 0.691807598, NA, 16.01840668,
NA, 16.01840668), Yr1987 = c(4.340588078, 4.447228937, NA,
NA, NA, NA, NA, NA, 16.01840668, NA, 16.01840668), Yr1988 = c(1.083782142,
1.110408824, NA, NA, NA, NA, NA, NA, 3.999564755, NA, 3.999564755
), Yrs_Exp = c(9.361643836, 8.635616438, 0.41369863, 2, 2.249315068,
2, 2.249315068, 2.665753425, 5.383561644, 5.309589041, 5.383561644
), arth_mean = c(4.175947792, 3.907637331, 20.60898553, 10.67893778,
2.22861541, 10.67893778, 2.22861541, 10.68682084, 12.35656585,
32.89144945, 12.35656585), median = c(4.340588078, 4.447228937,
20.60898553, 15.36541238, 2.941420153, 15.36541238, 2.941420153,
12.98530484, 16.01840668, 16.01840668, 16.01840668), cumulative = c(41.75947792,
35.16873597, 20.60898553, 32.03681335, 6.685846229, 32.03681335,
6.685846229, 42.74728337, 74.13939513, 197.3486967, 74.13939513
), caldate = c("12/31/88", "12/31/88", "12/31/88", "12/31/88",
"12/31/88", "12/31/88", "12/31/88", "12/31/88", "12/31/88",
"12/31/88", "12/31/88"), Age = c(9L, 9L, 9L, 9L, 8L, 9L,
7L, 7L, 10L, 10L, 8L), Month_Res = c(141L, 131L, 86L, 24L,
27L, 24L, 27L, 32L, 91L, 64L, 197L)), class = "data.frame", row.names = c(NA,
-11L))
Use ave to apply function by group with base R.
# Number of IDs per sector: ave() returns, for every row, the count of
# rows sharing that row's sector. The dput data names the id column
# "UC_ID" (not "ID"), so that column is used here.
with(dat, ave(UC_ID, sector, FUN = length))
# [1] 1 1 1 6 2 6 2 6 6 6 6
I have the dataset below where each year has a number that represents an exposure:
## ID DOB sector meters Oct Res_FROM Res_TO Exp_FROM
## 1 20100 1979-08-24 H38 6400 W 1979-08-15 1991-05-15 1979-08-24
## 2 20101 1980-05-05 B01 1600 NW 1980-05-15 1991-04-15 1980-05-15
## 3 20102 1979-03-17 H04 1600 SW 1972-06-15 1979-08-15 1979-03-17
## 4 20103 1981-11-30 B09 3200 NE 1982-01-15 1984-01-15 1982-01-15
## 5 20103 1981-11-30 B37 8000 N 1984-01-15 1986-04-15 1984-01-15
## 6 20104 1978-09-01 B09 3200 NE 1982-01-15 1984-01-15 1982-01-15
## Exp_TO Exps_Grp Yr1952 Yr1953 Yr1954 Yr1955 Yr1956 Yr1957 Yr1958 Yr1959
## 1 1988-12-31 fr51>88 NA NA NA NA NA NA NA NA
## 2 1988-12-31 fr51>88 NA NA NA NA NA NA NA NA
## 3 1979-08-15 between NA NA NA NA NA NA NA NA
## 4 1984-01-15 between NA NA NA NA NA NA NA NA
## 5 1986-04-15 between NA NA NA NA NA NA NA NA
## 6 1984-01-15 between NA NA NA NA NA NA NA NA
## Yr1960 Yr1961 Yr1962 Yr1963 Yr1964 Yr1965 Yr1966 Yr1967 Yr1968 Yr1969 Yr1970
## 1 NA NA NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA NA NA
## Yr1971 Yr1972 Yr1973 Yr1974 Yr1975 Yr1976 Yr1977 Yr1978 Yr1979 Yr1980
## 1 NA NA NA NA NA NA NA NA 5.950991 4.340588
## 2 NA NA NA NA NA NA NA NA NA 2.927725
## 3 NA NA NA NA NA NA NA NA 20.608986 NA
## 4 NA NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA NA
## Yr1981 Yr1982 Yr1983 Yr1984 Yr1985 Yr1986 Yr1987 Yr1988
## 1 4.340588 4.340588 4.340588 4.3405881 4.340588 4.3405881 4.340588 1.083782
## 2 4.447229 4.447229 4.447229 4.4472289 4.447229 4.4472289 4.447229 1.110409
## 3 NA NA NA NA NA NA NA NA
## 4 NA 15.365412 16.018407 0.6529943 NA NA NA NA
## 5 NA NA NA 2.9414202 3.052618 0.6918076 NA NA
## 6 NA 15.365412 16.018407 0.6529943 NA NA NA NA
## Yrs_Exp arth_mean median cumulative caldate Age Month_Res
## 1 9.3616438 4.175948 4.340588 41.759478 12/31/88 9 141
## 2 8.6356164 3.907637 4.447229 35.168736 12/31/88 9 131
## 3 0.4136986 20.608986 20.608986 20.608986 12/31/88 9 86
## 4 2.0000000 10.678938 15.365412 32.036813 12/31/88 9 24
## 5 2.2493151 2.228615 2.941420 6.685846 12/31/88 8 27
## 6 2.0000000 10.678938 15.365412 32.036813 12/31/88 9 24
I am wanting to calculate the average exposure for each year and then find out which years had an average exposure that exceeded a value of 4. How would I go about accomplishing this? So my desired output would be a list of each year with the average exposure, and then another output with a list of the years that had averages exceeding a value of 4. Reproducible data below.
dat <- structure(list(ID = c(20100L, 20101L, 20102L, 20103L, 20103L,
20104L, 20104L, 20105L, 20105L, 20106L, 20106L), DOB = c("1979-08-24",
"1980-05-05", "1979-03-17", "1981-11-30", "1981-11-30", "1978-09-01",
"1978-09-01", "1980-12-03", "1980-12-03", "1978-04-25", "1978-04-25"
), sector = c("H38", "B01", "H04", "B09", "B37", "B09", "B37",
"B09", "B09", "B09", "B09"), meters = c(6400L, 1600L, 1600L,
3200L, 8000L, 3200L, 8000L, 3200L, 3200L, 3200L, 3200L), Oct = c("W",
"NW", "SW", "NE", "N", "NE", "N", "NE", "NE", "NE", "NE"), Res_FROM = c("1979-08-15",
"1980-05-15", "1972-06-15", "1982-01-15", "1984-01-15", "1982-01-15",
"1984-01-15", "1980-12-15", "1983-08-15", "1978-04-15", "1983-08-15"
), Res_TO = c("1991-05-15", "1991-04-15", "1979-08-15", "1984-01-15",
"1986-04-15", "1984-01-15", "1986-04-15", "1983-08-15", "1991-03-15",
"1983-08-15", "2000-01-15"), Exp_FROM = c("1979-08-24", "1980-05-15",
"1979-03-17", "1982-01-15", "1984-01-15", "1982-01-15", "1984-01-15",
"1980-12-15", "1983-08-15", "1978-04-25", "1983-08-15"), Exp_TO = c("1988-12-31",
"1988-12-31", "1979-08-15", "1984-01-15", "1986-04-15", "1984-01-15",
"1986-04-15", "1983-08-15", "1988-12-31", "1983-08-15", "1988-12-31"
), Exps_Grp = c("fr51>88", "fr51>88", "between", "between", "between",
"between", "between", "between", "fr51>88", "between", "fr51>88"
), Yr1952 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1953 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1954 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1955 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), Yr1956 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), Yr1957 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), Yr1958 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), Yr1959 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1960 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1961 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1962 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), Yr1963 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), Yr1964 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), Yr1965 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), Yr1966 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1967 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1968 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1969 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), Yr1970 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), Yr1971 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), Yr1972 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), Yr1973 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1974 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1975 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1976 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), Yr1977 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), Yr1978 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
79.39642441, NA), Yr1979 = c(5.950991161, NA, 20.60898553, NA,
NA, NA, NA, NA, NA, 59.94484924, NA), Yr1980 = c(4.340588078,
2.927724588, NA, NA, NA, NA, NA, 0.758267013, NA, 16.01840668,
NA), Yr1981 = c(4.340588078, 4.447228937, NA, NA, NA, NA, NA,
16.01840668, NA, 16.01840668, NA), Yr1982 = c(4.340588078, 4.447228937,
NA, 15.36541238, NA, 15.36541238, NA, 16.01840668, NA, 16.01840668,
NA), Yr1983 = c(4.340588078, 4.447228937, NA, 16.01840668, NA,
16.01840668, NA, 9.952203009, 6.066203667, 9.952203009, 6.066203667
), Yr1984 = c(4.340588078, 4.447228937, NA, 0.652994292, 2.941420153,
0.652994292, 2.941420153, NA, 16.01840668, NA, 16.01840668),
Yr1985 = c(4.340588078, 4.447228937, NA, NA, 3.052618478,
NA, 3.052618478, NA, 16.01840668, NA, 16.01840668), Yr1986 = c(4.340588078,
4.447228937, NA, NA, 0.691807598, NA, 0.691807598, NA, 16.01840668,
NA, 16.01840668), Yr1987 = c(4.340588078, 4.447228937, NA,
NA, NA, NA, NA, NA, 16.01840668, NA, 16.01840668), Yr1988 = c(1.083782142,
1.110408824, NA, NA, NA, NA, NA, NA, 3.999564755, NA, 3.999564755
), Yrs_Exp = c(9.361643836, 8.635616438, 0.41369863, 2, 2.249315068,
2, 2.249315068, 2.665753425, 5.383561644, 5.309589041, 5.383561644
), arth_mean = c(4.175947792, 3.907637331, 20.60898553, 10.67893778,
2.22861541, 10.67893778, 2.22861541, 10.68682084, 12.35656585,
32.89144945, 12.35656585), median = c(4.340588078, 4.447228937,
20.60898553, 15.36541238, 2.941420153, 15.36541238, 2.941420153,
12.98530484, 16.01840668, 16.01840668, 16.01840668), cumulative = c(41.75947792,
35.16873597, 20.60898553, 32.03681335, 6.685846229, 32.03681335,
6.685846229, 42.74728337, 74.13939513, 197.3486967, 74.13939513
), caldate = c("12/31/88", "12/31/88", "12/31/88", "12/31/88",
"12/31/88", "12/31/88", "12/31/88", "12/31/88", "12/31/88",
"12/31/88", "12/31/88"), Age = c(9L, 9L, 9L, 9L, 8L, 9L,
7L, 7L, 10L, 10L, 8L), Month_Res = c(141L, 131L, 86L, 24L,
27L, 24L, 27L, 32L, 91L, 64L, 197L)), class = "data.frame", row.names = c(NA,
-11L))
You'll have to think about what you want to do with NAs; these solutions just drop them.
Base R solution:
# subset to year columns (all exposure columns are named Yr19xx)
dat_years <- dat[, grep("^Yr1", names(dat))]
# compute per-year averages, dropping NAs; vapply pins the result to one
# double per column (unlike sapply, whose return type depends on input)
avg_by_year <- vapply(dat_years, mean, numeric(1), na.rm = TRUE)
# find years w avg > 4, and remove "Yr" prefix
# (the !is.na() guard skips years whose average is NaN because every
# value was NA)
years_gt_4 <- names(avg_by_year)[!is.na(avg_by_year) & avg_by_year > 4] |>
sub(pattern = "Yr", replacement = "")
years_gt_4
# "1978" "1979" "1980" "1981" "1982" "1983" "1984" "1985" "1986" "1987"
tidyverse solution:
library(tidyverse)
# Reshape the per-year exposure columns (Yr1952..Yr1988) to long form --
# one row per record/year, with the "Yr" prefix stripped into a Year
# column -- then average the exposures within each year, dropping NAs.
avg_by_year <- dat %>%
pivot_longer(
cols = Yr1952:Yr1988,
names_to = "Year",
values_to = "Exposure",
names_prefix = "Yr"
) %>%
group_by(Year) %>%
summarize(Exposure = mean(Exposure, na.rm = TRUE))
# Keep only the years whose average exposure exceeds 4
years_gt_4 <- avg_by_year %>%
filter(Exposure > 4) %>%
pull(Year)
years_gt_4
# "1978" "1979" "1980" "1981" "1982" "1983" "1984" "1985" "1986" "1987"
I've got a list of around 140 data frames, all of which have different number of rows and columns. The only thing they have in common is that they have sample ID as rownames and years as columns. The years are between 1400-2018 and different data frames have samples in different time periods between those dates. Hence, some columns are matching (i.e. all data frames have values in those years), while others are not (e..g only 1 or 2 dataframes have values in these years). An example of a data frame is shown below:
> dput(shell[[20]])
list(structure(list(`1847` = c(NA, NA, NA, NA, NA, NA, NA, 1.33,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), `1848` = c(NA, NA, NA, NA, NA, NA, 1.86, 1.46, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), `1849` = c(NA,
NA, NA, NA, NA, NA, 1.75, 1.5, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), `1850` = c(NA, NA, NA, NA, NA,
NA, 1.7, 1.23, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
...
I want to merge these data frames so that rownames are kept (i.e. simply pasting them below each other) and values in columns are matched - i.e. if a given sample has values between 1650-1700 these are retained, and for years where that sample has no value there is NA. I also need the years to follow from earliest to present. So, I want it to look like this:
.. 1847 1848 1849 ...
S1 NA NA NA
S2 12.5 3.5 4.5
S3 NA NA 12.5
I've tried doing this with bind_rows and rbind.fill and it works for some columns, but not others - I got some very strange columns (e.g. column names like -150 in the middle of the data).
What could I be doing wrong?
If you convert your data.frames into data.table objects, you can try the following:
library(data.table)
# Convert each data.frame in the list to a data.table in place (setDT
# modifies by reference; lapply is used only for its side effect).
# keep.rownames saves the sample IDs -- stored as rownames in the
# question's data -- into a "sampleID" column; without it setDT drops
# the rownames and the later setcolorder(data, c("sampleID", years))
# step has no such column to refer to.
lapply(shell, setDT, keep.rownames = "sampleID")
Then, you can use rbindlist function that is very efficient and powerful:
# Row-bind all tables, matching columns by name and filling columns
# missing from any table with NA
data <- rbindlist(shell, use.names = TRUE, fill = TRUE)
Finally, to reorder the columns, you can use setcolorder. Something in this flavor (you might need to adapt it a little bit) :
# Chronologically ordered year columns, restricted to those that
# actually occur in the combined data
years <- as.character(1400:2018)[as.character(1400:2018) %in% colnames(data)]
# Put the sample ID first, then the years in order (setcolorder also
# modifies the data.table by reference)
setcolorder(data, c("sampleID", years))
I am trying to write a function that will uses values from variables stored in different columns to generate a new variable. The logic requires a series of ifelse statements. However, the final statement is always evaluating to true and I don't understand why.
Even more puzzling when I generated data for the MWE then the function works fine. But it still behaves bizarrely with a sample from the real data. I am guessing there is something in my environment that is causing mischief but I am now lost as how to investigate further.
FWIW I have tried writing this function in data.table and now dplyr syntax, and I get similar problems with both approaches.
Simpler functions that don't use ifelse statements seem to behave just fine.
gen_sofa_c <- function(data, map=NA, noradr=NA, dopa=NA, adr=NA, vasopressin=NA) {
# Derive a one-column cardiovascular SOFA score (sofa_c) from the blood
# pressure (map) and vasopressor dose columns of `data`. The arguments
# are given as bare column names; they are captured via match.call()
# and converted to strings below.
#
# NOTE(review): after the as.character() calls, `map`, `noradr`, etc.
# hold column-NAME strings in the function environment. Inside
# mutate(), each name resolves to a column of `data` when a column of
# that name exists; otherwise it silently falls back to the string in
# the function environment, and a comparison like noradr > 0.1 becomes
# a character comparison (e.g. "rx_norad" > "0.1"), which is TRUE for
# every row -- presumably the source of the "final statement always
# evaluates TRUE" symptom. Verify the captured names match columns of
# `data` before trusting the result.
library(dplyr)
# Extract the arguments and force conversion to string
pars <- as.list(match.call()[-1])
vasopressin <- as.character(pars$vasopressin)
noradr <- as.character(pars$noradr)
adr <- as.character(pars$adr)
dopa <- as.character(pars$dopa)
map <- as.character(pars$map)
# Default to NA
# if ("sofa_c" %in% names(data)) data$sofa_c <- NULL
# data$sofa_c <- as.numeric(NA)
return(
data %>%
# # Return 0 if MAP >= 70
mutate(sofa_c = ifelse(!is.na(map) & map >= 70, 0 , NA)) %>%
# # Return 1 if MAP < 70
mutate(sofa_c = ifelse(!is.na(map) & map < 70, 1 , sofa_c)) %>%
# # Return SOFA 2 if norad OR adr > 0.0 or dopamine > 5
mutate(sofa_c = ifelse(!is.na(noradr) & noradr > 0.0 , 2 , sofa_c)) %>%
# # Return SOFA 3 if norad OR adr > 0.1 or dopamine > 15
mutate(sofa_c = ifelse(!is.na(noradr) & noradr > 0.1 , 3 , sofa_c)) %>%
# # Return SOFA 4 if on vasopressin
mutate(sofa_c = ifelse(!is.na(vasopressin) & vasopressin > 0, 4 , sofa_c)) %>%
# Return sofa_c
select(sofa_c)
)
}
Here is the simulated data
# Simulate data
set.seed(1234)
tdata <- data.table(map=round(rnorm(100,70,10)), noradr=round(rnorm(100,0,1),2), vasopressin=sample(c(rep(NA,9),1)))
# Censor negative simulated noradrenaline doses to NA
tdata[, noradr := ifelse(noradr < 0, NA, noradr)]
# NOTE(review): the names passed here (rx_norad, rx_dopa, rx_adre,
# rx_vasopr) are only captured as strings by gen_sofa_c; the mutate()
# calls inside it bind to tdata's actual columns (map, noradr,
# vasopressin), which all exist -- presumably why this simulated case
# appears to work while the real data does not.
sofa_c <- gen_sofa_c(tdata, map=map, noradr=rx_norad, dopa=rx_dopa, adr=rx_adre, vasopressin=rx_vasopr)
table(sofa_c)
# Show the derived score next to its inputs
(cbind(tdata, sofa_c))
My output is this
R> table(sofa_c)
sofa_c
0 1 2 3 4
17 27 4 42 10
R> head((cbind(tdata, sofa_c)),10)
map noradr vasopressin sofa_c
1: 58 0.41 NA 3
2: 73 NA NA 0
3: 81 0.07 1 4
4: 47 NA NA 1
5: 74 NA NA 0
6: 75 0.17 NA 3
7: 64 NA NA 1
8: 65 0.17 NA 3
9: 64 0.35 NA 3
10: 61 NA NA 1
Here is the real data (as sample from >2 million rows)
nrow(ddata)
# Draw a 100-row sample keeping only the three columns of interest.
# NOTE(review): runif() yields non-integer indices (truncated by the
# subsetting) and can repeat rows; sample(nrow(ddata), 100) would be
# the conventional way to sample rows without replacement.
rdata <- ddata[runif(100,1,nrow(ddata)),.(map,norad=rx_norad,vasopressin=rx_vasopr)]
dput(rdata)
rm(sofa_c)
# NOTE(review): rdata's columns are map, norad and vasopressin, but
# gen_sofa_c's mutate() calls look up `noradr`; with no such column it
# falls back to the captured string "rx_norad", and "rx_norad" > 0.1
# is TRUE for every row -- consistent with the nearly-all-3 output
# shown below.
sofa_c <- gen_sofa_c(rdata, map=map, noradr=rx_norad, dopa=rx_dopa, adr=rx_adre, vasopressin=rx_vasopr)
table(sofa_c)
head((cbind(rdata, sofa_c)),10)
Here is the sample from the real data
R> dput(rdata)
structure(list(map = c(80, 82, 76, NA, 87, NA, NA, NA, NA, NA,
NA, NA, NA, 124, 65, 63, NA, 70, NA, NA, NA, NA, NA, NA, NA,
100, NA, NA, NA, NA, 85, 85, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 97, 84, 0, 84, NA, 75, NA, NA, NA, 67, NA, 58, NA, 153,
122, NA, NA, 91, 90, NA, NA, 87, NA, 60, 72, 107, 62, NA, NA,
97, 88, NA, NA, NA, 60, 81, 80, NA, NA, 82, 72, NA, 98, NA, NA,
80, 82, NA, NA, NA, 68, NA, NA, 126, 90, 65, 67, NA), norad = c(NA,
NA, NA, NA, 0, NA, NA, 0.14, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0.18, 0.00952381, NA, NA, 0.12962963, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 0.172222, NA, NA, NA, NA, NA, 0.0623529, NA,
NA, NA, NA, 0.29005848, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.221667,
NA, NA, NA, NA, NA, 0.02, NA, NA, NA, NA, NA, 0.08, NA, NA, NA,
NA, NA, NA, NA, NA, 0.284444444, NA, NA, 0.19, NA, NA, NA, NA,
4, NA, NA), vasopressin = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 2, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("map",
"norad", "vasopressin"), row.names = c(NA, -100L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x101810978>)
Here is the real data output
R> table(sofa_c)
sofa_c
3 4
99 1
R> head((cbind(rdata, sofa_c)),10)
map norad vasopressin sofa_c
1: 80 NA NA 3
2: 82 NA NA 3
3: 76 NA NA 3
4: NA NA NA 3
5: 87 0.00 NA 3
6: NA NA NA 3
7: NA NA NA 3
8: NA 0.14 NA 3
9: NA NA NA 3
10: NA NA NA 3