Related
I have the following dataset:
## ID cumulative sector meters DOB
## 1 20100 41.75948 H38 6400 1979-08-24
## 2 20101 35.16874 B01 1600 1980-05-05
## 3 20102 20.60899 H04 1600 1979-03-17
## 4 20103 38.72266 c("B09", "B37") c(3200, 8000) c("1981-11-30", "1981-11-30")
## 5 20104 38.72266 c("B09", "B37") c(3200, 8000) c("1978-09-01", "1978-09-01")
## 6 20105 116.88668 c("B09", "B09") c(3200, 3200) c("1980-12-03", "1980-12-03")
## Oct Res_FROM Res_TO
## 1 W 1979-08-15 1991-05-15
## 2 NW 1980-05-15 1991-04-15
## 3 SW 1972-06-15 1979-08-15
## 4 c("NE", "N") c("1982-01-15", "1984-01-15") c("1984-01-15", "1986-04-15")
## 5 c("NE", "N") c("1982-01-15", "1984-01-15") c("1984-01-15", "1986-04-15")
## 6 c("NE", "NE") c("1980-12-15", "1983-08-15") c("1983-08-15", "1991-03-15")
## Exp_FROM Exp_TO
## 1 1979-08-24 1988-12-31
## 2 1980-05-15 1988-12-31
## 3 1979-03-17 1979-08-15
## 4 c("1982-01-15", "1984-01-15") c("1984-01-15", "1986-04-15")
## 5 c("1982-01-15", "1984-01-15") c("1984-01-15", "1986-04-15")
## 6 c("1980-12-15", "1983-08-15") c("1983-08-15", "1988-12-31")
## Exps_Grp Yr1952 Yr1953 Yr1954 Yr1955 Yr1956 Yr1957 Yr1958
## 1 fr51>88 NA NA NA NA NA NA NA
## 2 fr51>88 NA NA NA NA NA NA NA
## 3 between NA NA NA NA NA NA NA
## 4 c("between", "between") NA NA NA NA NA NA NA
## 5 c("between", "between") NA NA NA NA NA NA NA
## 6 c("between", "fr51>88") NA NA NA NA NA NA NA
## Yr1959 Yr1960 Yr1961 Yr1962 Yr1963 Yr1964 Yr1965 Yr1966 Yr1967 Yr1968 Yr1969
## 1 NA NA NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA NA NA
## Yr1970 Yr1971 Yr1972 Yr1973 Yr1974 Yr1975 Yr1976 Yr1977 Yr1978 Yr1979
## 1 NA NA NA NA NA NA NA NA NA 5.950991
## 2 NA NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA 20.608986
## 4 NA NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA NA
## Yr1980 Yr1981 Yr1982 Yr1983 Yr1984 Yr1985 Yr1986
## 1 4.340588 4.340588 4.340588 4.340588 4.340588 4.340588 4.3405881
## 2 2.927725 4.447229 4.447229 4.447229 4.447229 4.447229 4.4472289
## 3 NA NA NA NA NA NA NA
## 4 NA NA 15.365412 16.018407 3.594414 3.052618 0.6918076
## 5 NA NA 15.365412 16.018407 3.594414 3.052618 0.6918076
## 6 0.758267 16.018407 16.018407 16.018407 16.018407 16.018407 16.0184067
## Yr1987 Yr1988 Yrs_Exp arth_mean median cumulative.1 Age
## 1 4.340588 1.083782 9.3616438 4.175948 4.340588 41.75948 9
## 2 4.447229 1.110409 8.6356164 3.907637 4.447229 35.16874 8
## 3 NA NA 0.4136986 20.608986 20.608986 20.60899 9
## 4 NA NA 4.2493151 7.744532 3.594414 38.72266 7
## 5 NA NA 4.2493151 7.744532 3.594414 38.72266 10
## 6 16.018407 3.999565 8.0493151 12.987409 16.018407 116.88668 8
From this information, I want to accomplish a couple things:
I want to calculate how many years that each individual (each ID is one individual) had a mean (which is the arth_mean column) greater than 4. Adding this as a column to my dataset would be ideal.
Coming off of number 2, I want to have each year that each individual exceeded 4 for arth_mean listed out for each person, also in a new column for my dataset. So for example, I may have a column that has 1954, 1966, 1967, etc. listed for each ID.
I'm relatively new to R, so any help or suggestions are appreciated. I have no idea how to get my desired output. I've added reproducible data below.
## structure(list(ID = 20100:20109, cumulative = c(41.75947792,
## 35.16873597, 20.60898553, 38.72265958, 38.72265958, 116.8866785,
## 271.4880918, 134.9473309, 35.91709062, 26.06165412), sector = c("H38",
## "B01", "H04", "c(\"B09\", \"B37\")", "c(\"B09\", \"B37\")", "c(\"B09\", \"B09\")",
## "c(\"B09\", \"B09\")", "H02", "B13", "H38"), meters = c("6400",
## "1600", "1600", "c(3200, 8000)", "c(3200, 8000)", "c(3200, 3200)",
## "c(3200, 3200)", "1600", "4800", "6400"), DOB = c("1979-08-24",
## "1980-05-05", "1979-03-17", "c(\"1981-11-30\", \"1981-11-30\")",
## "c(\"1978-09-01\", \"1978-09-01\")", "c(\"1980-12-03\", \"1980-12-03\")",
## "c(\"1978-04-25\", \"1978-04-25\")", "1979-06-22", "1978-10-09",
## "1982-04-26"), Oct = c("W", "NW", "SW", "c(\"NE\", \"N\")", "c(\"NE\", \"N\")",
## "c(\"NE\", \"NE\")", "c(\"NE\", \"NE\")", "SE", "N", "W"), Res_FROM = c("1979-08-15",
## "1980-05-15", "1972-06-15", "c(\"1982-01-15\", \"1984-01-15\")",
## "c(\"1982-01-15\", \"1984-01-15\")", "c(\"1980-12-15\", \"1983-08-15\")",
## "c(\"1978-04-15\", \"1983-08-15\")", "1979-06-15", "1978-10-15",
## "1982-04-15"), Res_TO = c("1991-05-15", "1991-04-15", "1979-08-15",
## "c(\"1984-01-15\", \"1986-04-15\")", "c(\"1984-01-15\", \"1986-04-15\")",
## "c(\"1983-08-15\", \"1991-03-15\")", "c(\"1983-08-15\", \"2000-01-15\")",
## "1991-04-15", "1983-03-15", "1991-04-15"), Exp_FROM = c("1979-08-24",
## "1980-05-15", "1979-03-17", "c(\"1982-01-15\", \"1984-01-15\")",
## "c(\"1982-01-15\", \"1984-01-15\")", "c(\"1980-12-15\", \"1983-08-15\")",
## "c(\"1978-04-25\", \"1983-08-15\")", "1979-06-22", "1978-10-15",
## "1982-04-26"), Exp_TO = c("1988-12-31", "1988-12-31", "1979-08-15",
## "c(\"1984-01-15\", \"1986-04-15\")", "c(\"1984-01-15\", \"1986-04-15\")",
## "c(\"1983-08-15\", \"1988-12-31\")", "c(\"1983-08-15\", \"1988-12-31\")",
## "1988-12-31", "1983-03-15", "1988-12-31"), Exps_Grp = c("fr51>88",
## "fr51>88", "between", "c(\"between\", \"between\")", "c(\"between\", \"between\")",
## "c(\"between\", \"fr51>88\")", "c(\"between\", \"fr51>88\")",
## "fr51>88", "between", "fr51>88"), Yr1952 = c(NA, NA, NA, NA,
## NA, NA, NA, NA, NA, NA), Yr1953 = c(NA, NA, NA, NA, NA, NA, NA,
## NA, NA, NA), Yr1954 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## ), Yr1955 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1956 = c(NA,
## NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1957 = c(NA, NA, NA, NA,
## NA, NA, NA, NA, NA, NA), Yr1958 = c(NA, NA, NA, NA, NA, NA, NA,
## NA, NA, NA), Yr1959 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## ), Yr1960 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1961 = c(NA,
## NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1962 = c(NA, NA, NA, NA,
## NA, NA, NA, NA, NA, NA), Yr1963 = c(NA, NA, NA, NA, NA, NA, NA,
## NA, NA, NA), Yr1964 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## ), Yr1965 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1966 = c(NA,
## NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1967 = c(NA, NA, NA, NA,
## NA, NA, NA, NA, NA, NA), Yr1968 = c(NA, NA, NA, NA, NA, NA, NA,
## NA, NA, NA), Yr1969 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## ), Yr1970 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1971 = c(NA,
## NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1972 = c(NA, NA, NA, NA,
## NA, NA, NA, NA, NA, NA), Yr1973 = c(NA, NA, NA, NA, NA, NA, NA,
## NA, NA, NA), Yr1974 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## ), Yr1975 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1976 = c(NA,
## NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1977 = c(NA, NA, NA, NA,
## NA, NA, NA, NA, NA, NA), Yr1978 = c(NA, NA, NA, NA, NA, NA, 79.39642441,
## NA, 5.655852488, NA), Yr1979 = c(5.950991161, NA, 20.60898553,
## NA, NA, NA, 59.94484924, 28.50091596, 16.41746701, NA), Yr1980 = c(4.340588078,
## 2.927724588, NA, NA, NA, 0.758267013, 16.01840668, 12.90308755,
## 4.387060214, NA), Yr1981 = c(4.340588078, 4.447228937, NA, NA,
## NA, 16.01840668, 16.01840668, 12.90308755, 4.387060214, NA),
## Yr1982 = c(4.340588078, 4.447228937, NA, 15.36541238, 15.36541238,
## 16.01840668, 16.01840668, 12.90308755, 4.387060214, 3.274931595
## ), Yr1983 = c(4.340588078, 4.447228937, NA, 16.01840668,
## 16.01840668, 16.01840668, 16.01840668, 12.90308755, 0.682590481,
## 4.340588078), Yr1984 = c(4.340588078, 4.447228937, NA, 3.594414445,
## 3.594414445, 16.01840668, 16.01840668, 12.90308755, NA, 4.340588078
## ), Yr1985 = c(4.340588078, 4.447228937, NA, 3.052618478,
## 3.052618478, 16.01840668, 16.01840668, 12.90308755, NA, 4.340588078
## ), Yr1986 = c(4.340588078, 4.447228937, NA, 0.691807598,
## 0.691807598, 16.01840668, 16.01840668, 12.90308755, NA, 4.340588078
## ), Yr1987 = c(4.340588078, 4.447228937, NA, NA, NA, 16.01840668,
## 16.01840668, 12.90308755, NA, 4.340588078), Yr1988 = c(1.083782142,
## 1.110408824, NA, NA, NA, 3.999564755, 3.999564755, 3.221714571,
## NA, 1.083782142), Yrs_Exp = c(9.361643836, 8.635616438, 0.41369863,
## 4.249315068, 4.249315068, 8.049315068, 10.69315068, 9.534246575,
## 4.416438356, 6.687671233), arth_mean = c(4.175947792, 3.907637331,
## 20.60898553, 7.744531916, 7.744531916, 12.98740872, 24.68073562,
## 13.49473309, 5.98618177, 3.723093446), median = c(4.340588078,
## 4.447228937, 20.60898553, 3.594414445, 3.594414445, 16.01840668,
## 16.01840668, 12.90308755, 4.387060214, 4.340588078), cumulative.1 = c(41.75947792,
## 35.16873597, 20.60898553, 38.72265958, 38.72265958, 116.8866785,
## 271.4880918, 134.9473309, 35.91709062, 26.06165412), Age = c(9L,
## 8L, 9L, 7L, 10L, 8L, 10L, 9L, 10L, 6L)), class = "data.frame", row.names = c(NA,
## -10L))
If your 10 row dataset above is dat, then you can melt the dataset into a long format (Yr columns), restrict to rows where the value in those year columns exceeds 4, count the number of rows as num_yr_above, and concatenate the year values as years_above.
Then just join back to dat
library(data.table)
setDT(dat)
dat[melt(dat,id="ID",
measure=patterns("Yr\\d"))[value>4, .(
num_yr_above=.N,
years_above = list(gsub("Yr","",variable))),
by=ID],
on=.(ID)]
Output: (all columns included, but here I show only the new additional columns added to the dataset)
ID num_yr_above years_above
1: 20106 10 1978,1979,1980,1981,1982,1983,...
2: 20108 5 1978,1979,1980,1981,1982
3: 20100 9 1979,1980,1981,1982,1983,1984,...
4: 20102 1 1979
5: 20107 9 1979,1980,1981,1982,1983,1984,...
6: 20101 7 1981,1982,1983,1984,1985,1986,...
7: 20105 7 1981,1982,1983,1984,1985,1986,...
8: 20103 2 1982,1983
9: 20104 2 1982,1983
10: 20109 5 1983,1984,1985,1986,1987
I have a table test whose NA values I would like to approximate based on linear interpolation between values that do exist.
For example, the second row plotted looks like this:
v1 <- unlist(test[2,])
plot(v1[!is.na(v1)], names(v1)[!is.na(v1)], type="l", add = TRUE)
How would one go about interpolating/approximating the NA values along the x-axis in this case? Any suggestions in base R or dplyr would be helpful
test
variable 26500 30000 30100 30700 31600 33700 33800 33900 34000 34600 34800 35100 35200 35300
1 -20 NA 0 NA NA 10 20 NA NA NA 30 NA NA NA NA
2 -10 NA 0 NA NA NA 10 NA NA NA 20 NA NA NA 30
3 0 0 NA NA NA NA NA 10 NA NA NA 20 NA NA NA
4 24 NA NA NA 0 NA NA NA NA 10 NA NA NA 20 NA
5 40 NA NA 0 NA NA NA NA 10 NA NA NA 20 NA NA
6 55 NA NA 0 NA NA NA NA 10 NA NA NA 20 NA NA
35400 35600 35800 35900 36200 36300 36400 36700 36900 37000 37200 37800 37900 38000 38200
1 40 NA NA NA 50 NA NA NA NA NA 60 NA NA NA 70
2 NA NA NA 40 NA NA NA 50 NA NA NA 60 NA NA NA
3 NA 30 NA NA 40 NA NA NA 50 NA NA NA 60 NA NA
4 NA NA 30 NA NA 40 NA NA NA 50 NA NA NA 60 NA
5 NA NA 30 NA NA 40 NA NA NA 50 NA NA NA NA 60
6 NA NA NA 30 NA NA 40 NA NA 50 NA NA NA NA 60
38800 39000 39100 39200 39700 39800 39900 40000 40200 40600 40700 40800 41700 41800
1 NA NA NA 80 NA NA NA NA 90 NA NA NA 100 NA
2 70 NA NA NA 80 NA NA NA NA 90 NA NA 100 NA
3 70 NA NA NA NA 80 NA NA NA NA 90 NA 100 NA
4 NA 70 NA NA NA NA NA 80 NA NA NA 90 100 NA
5 NA NA 70 NA NA NA NA 80 NA NA NA 90 NA 100
6 NA 70 NA NA NA NA 80 NA NA NA NA 90 100 NA
Here is the sample data:
dput(test)
structure(list(variable = c(-20, -10, 0, 24, 40, 55), `26500` = c(NA,
NA, 0L, NA, NA, NA), `30000` = c(0L, 0L, NA, NA, NA, NA), `30100` = c(NA,
NA, NA, NA, 0L, 0L), `30700` = c(NA, NA, NA, 0L, NA, NA), `31600` = c(10L,
NA, NA, NA, NA, NA), `33700` = c(20L, 10L, NA, NA, NA, NA), `33800` = c(NA,
NA, 10L, NA, NA, NA), `33900` = c(NA, NA, NA, NA, 10L, 10L),
`34000` = c(NA, NA, NA, 10L, NA, NA), `34600` = c(30L, 20L,
NA, NA, NA, NA), `34800` = c(NA, NA, 20L, NA, NA, NA), `35100` = c(NA,
NA, NA, NA, 20L, 20L), `35200` = c(NA, NA, NA, 20L, NA, NA
), `35300` = c(NA, 30L, NA, NA, NA, NA), `35400` = c(40L,
NA, NA, NA, NA, NA), `35600` = c(NA, NA, 30L, NA, NA, NA),
`35800` = c(NA, NA, NA, 30L, 30L, NA), `35900` = c(NA, 40L,
NA, NA, NA, 30L), `36200` = c(50L, NA, 40L, NA, NA, NA),
`36300` = c(NA, NA, NA, 40L, 40L, NA), `36400` = c(NA, NA,
NA, NA, NA, 40L), `36700` = c(NA, 50L, NA, NA, NA, NA), `36900` = c(NA,
NA, 50L, NA, NA, NA), `37000` = c(NA, NA, NA, 50L, 50L, 50L
), `37200` = c(60L, NA, NA, NA, NA, NA), `37800` = c(NA,
60L, NA, NA, NA, NA), `37900` = c(NA, NA, 60L, NA, NA, NA
), `38000` = c(NA, NA, NA, 60L, NA, NA), `38200` = c(70L,
NA, NA, NA, 60L, 60L), `38800` = c(NA, 70L, 70L, NA, NA,
NA), `39000` = c(NA, NA, NA, 70L, NA, 70L), `39100` = c(NA,
NA, NA, NA, 70L, NA), `39200` = c(80L, NA, NA, NA, NA, NA
), `39700` = c(NA, 80L, NA, NA, NA, NA), `39800` = c(NA,
NA, 80L, NA, NA, NA), `39900` = c(NA, NA, NA, NA, NA, 80L
), `40000` = c(NA, NA, NA, 80L, 80L, NA), `40200` = c(90L,
NA, NA, NA, NA, NA), `40600` = c(NA, 90L, NA, NA, NA, NA),
`40700` = c(NA, NA, 90L, NA, NA, NA), `40800` = c(NA, NA,
NA, 90L, 90L, 90L), `41700` = c(100L, 100L, 100L, 100L, NA,
100L), `41800` = c(NA, NA, NA, NA, 100L, NA)), row.names = c(NA,
-6L), class = "data.frame")
We could use na.interp from forecast
library(forecast)
test[-1] <- t(apply(test[-1], 1, na.interp))
Or with na.approx
test[-1] <- t(apply(test[-1], 1, na.approx, na.rm = FALSE))
then do the plotting
v1 <- unlist(test[2, -1])
plot(v1, names(v1), type = 'l')
If you want to switch easily between different interpolation methods (or time series imputation methods in general) you can also use the imputeTS package.
For the requested solution this would be:
library("imputeTS")
test[-1] <- t(apply(test[-1], 1, na_interpolation, option = "linear"))
Switching to Spline interpolation would look like this:
test[-1] <- t(apply(test[-1], 1, na_interpolation, option = "stine"))
Another option could be Stineman interpolation:
test[-1] <- t(apply(test[-1], 1, na_interpolation, option = "spline"))
Other imputation methods like na_ma (moving average imputation), na_kalman (Kalman smoothing on structural time series models) would be also possible, if you replace the na_interpolation with the specific function (see also GitHub package Readme for a imputation function overview).
This question already has answers here:
Merging two columns into one in R [duplicate]
(7 answers)
Closed 2 years ago.
I have list of lists (same lengths: n). How can I concatenate them, to get one vector of the same input length (n) ? For example I have:
[[1]]
[1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[[2]]
[1] "a" "a" NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[[3]]
[1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA "d" "e" NA NA NA
I want to get:
"a" "a" NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA "d" "e" NA NA NA
I have tried:
Reduce('union',lapply(l,function(x){x$AB}))
But I got:
"a" NA "d" "e"
I want to get a vector with same length as the input l[[1]]$AB. Any ideas?
I believe you're looking for dplyr's coalesce:
library(dplyr)
mylist<-lapply(mylist,as,"character")
do.call(coalesce,mylist)
[1] "a" "a" NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA "d" "e" NA NA NA
NOTE: Using your exact example, the first element on the list is automatically interpreted as logical class because it's all NAs. I added row 2 in the code above to get around that by coercing all rows in the list into a character class.
We can use pmax
do.call(pmax, c(l, na.rm = TRUE))
#[1] "a" "a" NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA "d" "e" NA NA NA
data
l <- list(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), c("a", "a", NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, "d", "e", NA, NA, NA))
You could also perform row operation with conversion to a data.frame :
list_obj <- list(rep(NA, 20), c("a", "a", rep(NA, 18)), c(rep(NA,16), "d", "e", NA, NA))
df_l <- as.data.frame(list_obj)
apply(df_l, 1, FUN = function(x) paste(ifelse(is.na(x), "", x), collapse = ""))
#> [1] "a" "a" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "d" "e" ""
#> [20] ""
You could coerce list to a data frame and use ifelse case handling like this:
apply(as.data.frame(l), 1, function(x) ifelse(all(is.na(x)), NA, x[!is.na(x)]))
# [1] "a" "a" NA NA NA NA NA NA NA NA NA NA NA NA NA NA
# [17] NA NA "d" "e" NA NA NA
Data
l <- list(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), c("a", "a", NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, "d", "e", NA, NA, NA))
I have a matrix and my objective is to find the maximum of each column and then to divide that number by the sum of all values in the row which contains the max of that column. In other words
max(y) / sum of values in the row where y is the max
How would apply this formula to every column in R ?
> the_matrix
Source: local data frame [20 x 10]
type 100 100F 100I 100X 101 102 1028P 103 103D
(fctr) (int) (int) (int) (int) (int) (int) (int) (int) (int)
1 0 NA NA NA NA NA NA NA NA NA
2 0A 2 NA NA NA NA NA NA NA NA
3 0B NA NA NA NA NA NA NA NA NA
4 0C NA NA NA NA NA NA NA NA NA
5 0E NA NA NA NA NA NA NA NA NA
6 0G NA NA NA NA NA NA NA NA NA
7 0O NA NA NA NA NA NA NA NA NA
8 0Z NA NA NA NA NA NA NA NA NA
9 1 2 NA NA NA NA NA NA NA NA
10 1A 3968 NA 214 26 4 289 8 56030 7484
11 1B 172 NA 107 NA NA 2 NA 372 3829
12 1C 584 NA 19 NA NA 1 NA 72951 363
13 1D 27 NA NA NA NA NA NA 365 22
14 1E 27944 16 68 NA NA NA 1 62 12
15 1F 1 NA 1 NA NA 1 NA 368 27
16 1G 4 NA NA NA NA NA NA 7 NA
17 1H 65 NA 6 21 1 6 3 714 59
18 1M NA NA NA NA NA NA NA 1 NA
19 1N NA NA NA NA NA NA NA NA NA
20 1Q NA NA NA NA NA NA NA NA NA
> dput(the_matrix)
structure(list(type = structure(1:20, .Label = c("0", "0A", "0B",
"0C", "0E", "0G", "0O", "0Z", "1", "1A", "1B", "1C", "1D", "1E",
"1F", "1G", "1H", "1M", "1N", "1Q", "1S", "1X", "1Z", "2", "2A",
"2B", "2C", "2D", "2E", "2F", "2G", "2H", "2I", "2J", "2M", "2S",
"2T", "2X", "2Z", "3", "3B", "3C", "3E", "4B", "5H", "8Z", "0H",
"1I", "1R", "2N", "3H", "5D", "0D", "1K", "1P", "1T", "1U", "1V",
"1W", "1Y", "2U", "3A", "4A", "5C", "7H", "9", "0F", "0T", "1J",
"2L", "0W", "2Q", "3G"), class = "factor"), `100` = c(NA, 2L,
NA, NA, NA, NA, NA, NA, 2L, 3968L, 172L, 584L, 27L, 27944L, 1L,
4L, 65L, NA, NA, NA), `100F` = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 16L, NA, NA, NA, NA, NA, NA), `100I` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, 214L, 107L, 19L, NA, 68L, 1L,
NA, 6L, NA, NA, NA), `100X` = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, 26L, NA, NA, NA, NA, NA, NA, 21L, NA, NA, NA), `101` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, 4L, NA, NA, NA, NA, NA, NA, 1L,
NA, NA, NA), `102` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 289L,
2L, 1L, NA, NA, 1L, NA, 6L, NA, NA, NA), `1028P` = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, 8L, NA, NA, NA, 1L, NA, NA, 3L, NA,
NA, NA), `103` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 56030L,
372L, 72951L, 365L, 62L, 368L, 7L, 714L, 1L, NA, NA), `103D` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, 7484L, 3829L, 363L, 22L, 12L,
27L, NA, 59L, NA, NA, NA)), .Names = c("type", "100", "100F",
"100I", "100X", "101", "102", "1028P", "103", "103D"), class = c("tbl_df",
"data.frame"), row.names = c(NA, -20L))
Going step-by-step:
# let's not call a data frame a matrix
real_matrix = as.matrix(the_matrix[, -1])
# max of each column
col_max = apply(real_matrix, 2, max, na.rm = T)
# which row contains the max
col_which_max = apply(real_matrix, 2, which.max)
# row totals
row_total = rowSums(real_matrix, na.rm = T)
# col max divided by row total for corresponding row
col_max / row_total[col_which_max]
Rounded to 3 decimals, this yields the following:
100 100F 100I 100X 101 102 1028P 103 103D
0.994 0.001 0.003 0.000 0.000 0.004 0.000 0.987 0.110
I have a data.frame called RawHM and want, for each row, to evaluate sets of columns defined by the entries in the list AllList, in order to see if there is enough non-NA observations (not less than 2) to keep the column set of entries for that row. If not, the column set entries should be substituted with NA's.
AllList:
> dput(AllList)
structure(list(EGI = c("OO", "PP", "QQ"), Ref = c("RR", "SS",
"TT")), .Names = c("EGI", "Ref"))
RawHM:
> dput(head(RawHM,10))
structure(list(OO = c(2.26128283268031, NA, NA, NA, 3.1189673217816,
2.68131772865193, 1.50542478607416, NA, NA, NA), PP = c(NA, 2.86537733048028,
2.02969026818987, NA, 2.54112005565494, 3.01623803266379, 1.73909499803785,
2.49712237003491, NA, 1.67635525591635), QQ = c(NA, NA, 1.91968060122123,
NA, NA, 2.63463138625395, NA, NA, NA, NA), RR = c(NA, NA, NA,
NA, NA, 1.01488582084669, 1.01944283768403, NA, 1.06329113924051,
NA), SS = c(0.950310559006211, 0.924124326404927, 1.07886334610473,
0.951793999929161, 0.847931452310888, 0.879173290937997, 0.882126364182319,
NA, NA, 0.713085668766746), TT = c(NA, NA, 1.09812749411644,
NA, 0.9994646420402, 1.21090641120118, 1.25090285854196, NA,
NA, NA)), .Names = c("OO", "PP", "QQ", "RR", "SS", "TT"), row.names = c(1L,
2L, 15L, 16L, 23L, 24L, 25L, 30L, 36L, 40L), class = "data.frame")
I have tried by making a function:
func<-function(x)unlist(lapply(AllList,function(y)if(length(na.omit(x[unlist(y)]))<2){rep(NA,length(unlist(y)))} else{x[unlist(y)]}))
And then:
output<-t(apply(RawHM,1,func))
Which works in priciple but doesnt preserve the colnames, which i want to be the same as in the RawHM dataframe. I would prefer to avoid renaming the columns afterwards..
> dput(head(output,10))
structure(c(NA, NA, NA, NA, 3.1189673217816, 2.68131772865193,
1.50542478607416, NA, NA, NA, NA, NA, 2.02969026818987, NA, 2.54112005565494,
3.01623803266379, 1.73909499803785, NA, NA, NA, NA, NA, 1.91968060122123,
NA, NA, 2.63463138625395, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1.01488582084669, 1.01944283768403, NA, NA, NA, NA, NA, 1.07886334610473,
NA, 0.847931452310888, 0.879173290937997, 0.882126364182319,
NA, NA, NA, NA, NA, 1.09812749411644, NA, 0.9994646420402, 1.21090641120118,
1.25090285854196, NA, NA, NA), .Dim = c(10L, 6L), .Dimnames = list(
c("1", "2", "15", "16", "23", "24", "25", "30", "36", "40"
), NULL))
Any help would be very welcome :-)
Regards
Mads
func is a very strange function... funky even!
When you use apply your data gets converted to a matrix from a data.frame. Your function seems to operate differently if it is a data.frame rather than a matrix:
func(RawHM[1,])
EGI.OO EGI.PP EGI.QQ Ref.RR Ref.SS Ref.TT
2.2612828 NA NA NA 0.9503106 NA
func(as.matrix(RawHM)[1,])
EGI1 EGI2 EGI3 Ref1 Ref2 Ref3
NA NA NA NA NA NA
Note that you get different results, and different names!
In any case, the names issue arises from the fact that when you produce the NAs, there are no names, so the result give inconsistent output for apply. To fix this, here is a modification:
func2 <- function(x)unlist(lapply(AllList,function(y)if(length(na.omit(x[unlist(y)]))<2){sapply(y,function(z) NA)} else{x[unlist(y)]}))
t(apply(RawHM,1,func2))
EGI.OO EGI.PP EGI.QQ Ref.RR Ref.SS Ref.TT
1 NA NA NA NA NA NA
2 NA NA NA NA NA NA
15 NA 2.029690 1.919681 NA 1.0788633 1.0981275
16 NA NA NA NA NA NA
23 3.118967 2.541120 NA NA 0.8479315 0.9994646
24 2.681318 3.016238 2.634631 1.014886 0.8791733 1.2109064
25 1.505425 1.739095 NA 1.019443 0.8821264 1.2509029
30 NA NA NA NA NA NA
36 NA NA NA NA NA NA
40 NA NA NA NA NA NA