for loop to find threshold values between different data frames - r

I have 2 data frame with some matching columns (pollutants).
The first data frame contains the observations while the second one contains different thresholds for some pollutants.
Here a small subset of both data frames:
dput(df1)
structure(list(sample = structure(27:76, .Label = c("A_1", "A_2",
"A_LS", "A_PC", "A_PM", "B_1", "B1_1", "B1_2", "B1-8_PC", "B1-8_PM",
"B1_LS", "B1_PC", "B1_PM", "B_2", "B2_1", "B2_2", "B2-8_PC",
"B2-8_PM", "B2_LS", "B2_PC", "B2_PM", "B_LS", "B_PC", "B_PM",
"C_1", "C_2", "C386", "C387", "C388", "C389", "C390", "C391",
"C392", "C393", "C394", "C395", "C396", "C397", "C398", "C399",
"C400", "C401", "C402", "C403", "C404", "C405", "C406", "C407",
"C408", "C409", "C410", "C411", "C412", "C413", "C414", "C415",
"C416", "C417", "C418", "C419", "C420", "C421", "C422", "C423",
"C424", "C425", "C426", "C427", "C428", "C429", "C430", "C431",
"C432", "C433", "C434", "C435", "C436", "C437", "C438", "C439",
"C440", "C441", "C442", "C443", "C444", "C445", "C446", "C447",
"C448", "C449", "C450", "C451", "C452", "C453", "C454", "C455",
"C456", "C457", "C458", "C459", "C460", "C461", "C462", "C463",
"C464", "C465", "C466", "C467", "C468", "C469", "C470", "C471",
"C472", "C473", "C474", "C475", "C476", "C477", "C478", "C479",
"C480", "C481", "C482", "C483", "C484", "C485", "C486", "C487",
"C488", "C489", "C490", "C491", "C492", "C493", "C494", "C495",
"C496", "C497", "C498", "C499", "C500", "C501", "C502", "C503",
"C504", "C505", "C506", "C507", "C508", "C509", "C510", "C511",
"C512", "C513", "C514", "C515", "C516", "C517", "C518", "C519",
"C520", "C521", "C522", "C523", "C524", "C-8_PC", "C-8_PM", "D_1",
"D_2", "E_1", "E_2", "F_1", "F_2"), class = "factor"), As = c(9,
8.75, 13.5, 7.75, 7.6, 8.33, 8, 8.75, 7.4, 8.25, 8.17, 7.75,
7.6, 7.5, 7.2, 8, 7.83, 7.75, 7, 7.5, 8.17, 8.75, 6.67, 7, 5.83,
6.75, 5.6, 6.4, 6.2, 6.2, 6.2, 6.25, 7, 6, 6, 6.4, 6, 5.8, 5.6,
6, 5.8, 7.25, 8.8, 8.5, 8, 8.25, 8.25, 8.5, 8.25, 8.25), Al = c(30245,
38060, 36280, 24355, 27776, 35190, 38733.8, 36400, 29624, 33699.75,
32163.33, 30645.75, 31373, 26647.5, 19987.6, 32210, 27158, 24220.25,
18598.5, 23081.75, 29393, 26800.5, 22581.67, 29290, 29651.67,
20947.5, 19762.6, 23815, 32784.8, 20696.2, 26880.6, 25087.75,
19497.2, 21794, 32232, 24253.4, 20034, 21270, 22510, 15170.25,
8956.6, 21612.25, 35828, 30006.25, 27128.75, 25835, 31118.75,
35614.5, 37440.25, 33736.75), Hg = c(0.25, 0.35, 0.48, 1.03,
1.12, 0.2, 1.14, 0.4, 2, 0.48, 0.85, 0.18, 0.76, 0.4, 0.48, 0.35,
0.32, 0.33, 0.4, 0.13, 0.15, 0.13, 0.87, 0.12, 0.03, 0.33, 0.2,
0.22, 0.04, 0.16, 0.1, 0.18, 0.11, 0.08, 0.03, 0.06, 0.06, 0.1,
0.03, 0.07, 0.03, 0.1, 0.08, 0.11, 0.1, 0.13, 0.08, 0.12, 0.07,
0.09)), .Names = c("sample", "As", "Al", "Hg"), row.names = c(NA,
50L), class = "data.frame")
and
dput(df2)
structure(list(As = c(25L, 32L), Hg = c(0.4, 0.8), Cr = c(100L,
360L), Element = structure(c(1L, 3L), .Label = c("LCB", "LCB_pelite",
"LCL"), class = "factor")), .Names = c("As", "Hg", "Cr", "Element"
), row.names = c(NA, -2L), class = "data.frame")
Actually the original data frames are bigger, but this subset gives the idea.
What I want now is to put in a 3rd data frames the values of each element of the first df that exceed the threshold values contained in the second df.
Be aware that there are 2 different threshold values (for each element) in df2 and df2 has some element not matched in df1 (for example Cr).
I've tried to write a for loop but I was able to do that just for 1 element at a time:
for (i in df2$As) {
print(length(which(df1$As > i)))
}
I've also tried to use nested for loops but without success..

I'm pretty sure this does not look good, but I think it works. I added some extra lines to match only the elements found in both data frames, which in this case is only 1. It might ned some changes for your full data:
df1.2 <- rbind(df1, df1) #Duplicate the df1 to compare to each threshold value
df1.2 <- df1.2[order(df1.2$sample),] #Order by sample again
cols2 <- na.omit(match(colnames(df1), colnames(df2)))[[1]] #Get the columns of df2 which are in df1
cols1 <- na.omit(match(colnames(df2), colnames(df1)))[[1]] #Get the columns of df1 which are in df2
df2.2 <- df2[rep(1:2, nrow(df1)),cols2] #Replicates df2 the number of times to allow matching the thresholds to each sample, once for each threshold
exceeds <- df1.2[,cols1]>df2.2 #Make the comparions and return a boolean
sum(exceeds) #You will need colSums() for more than one column
With your sample data it's also not clear from the answer which elements ir refers to, but this shouldn't happen if more than one element matches and your result is a matrix.
Maybe there's a more elegant way without replicating the dataframes and having to worry about number of element matches.

df3=data.frame(Pollutant="Z",LCB=0,LCL=0,stringsAsFactors=FALSE)
for (p in names(df1)[-1]) {
if(p %in% names(df2)[1:(length(df2)-1)]) {
df3 = rbind(df3,c(p,sum(df1[p]>df2[[p]][1]),sum(df1[p]>df2[[p]][2])))
}
}
df3=df3[-1,]
df3
Update:
Ah, each new row is rbound as a character vector. To finish up:
str(df3)
df3$LCB=as.numeric(df3$LCB)
df3$LCL=as.numeric(df3$LCL)
str(df3)

How about this?
foo <- function(x, y) {
sapply(x, function(i) sum(y>i))
}
cols = c("As", "Hg")
mapply(foo, df2[cols], df1[cols])
# As Hg
# [1,] 0 10
# [2,] 0 6
Convert this to a data.frame if necessary.

Related

Find the 3 nearest neighbours (dist()?) and calculate mean in new column

This is a sample of the data
structure(list(Season = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("2018/2019",
"2019/2020"), class = "factor"), Date2 = structure(c(17860, 17888,
17916, 17940, 17945, 17952, 17953, 17954, 17978, 17999, 18005,
18188, 18209, 18223, 18237, 18320, 18322, 18334, 18447, 18476
), class = "Date"), HT.av.points = c(0.57, 1.5, 1.67, 1.8, 1.09,
2.18, 1.42, 1.45, 1.79, 1.35, 1.14, 1.83, 2, 1.17, 1.88, 1.83,
1.33, 0.92, 1.31, 1.06), AT.av.points = c(1.14, 2.33, 0.56, 1.2,
1.09, 1.6, 1.08, 1.9, 1.17, 0.9, 1.38, 0.67, 2.14, 1.33, 0.62,
1.08, 2.17, 1.38, 0.56, 0.94), HT_av.PointsTotal = c(0.86, 1.16,
1.18, 1.23, 0.86, 1.86, 1.2, 1.18, 1.5, 1.1, 1.07, 1.46, 1.6,
1.08, 1.75, 1.4, 1.16, 0.92, 1.03, 0.97), AT_av.PointsTotal = c(2.07,
2.21, 0.76, 1.42, 1.59, 1.5, 1.2, 1.91, 1.65, 1.43, 1.38, 0.54,
1.87, 1.58, 0.8, 1.6, 2.32, 1.42, 1.12, 1.32), DIFF.AV.POINTS.PREDICTION = c(-0.28,
-0.43, 0.51, 0.52, -0.36, 0.56, 0.28, -0.38, -0.2, 0.03, -0.43,
1.24, -0.32, -0.29, 1.44, 0.28, -0.85, -0.38, 1.01, 0.22), Over2.5G = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1)), row.names = c(NA,
-20L), .internal.selfref = <pointer: 0x1ca2448>, class = c("data.table",
"data.frame"), .Names = c("Season", "Date2", "HT.av.points",
"AT.av.points", "HT_av.PointsTotal", "AT_av.PointsTotal", "DIFF.AV.POINTS.PREDICTION",
"Over2.5G"))
What I want to do:
group by Season
After the group by, I want to find the 3 previous rows that are most similar (according to the following columns) to the current row.
(HT.av.points, AT.av.points, HT_av.PointsTotal, AT_av.PointsTotal, DIFF.AV.POINTS.PREDICTION)
I guess the dist() function is a possibility.
Finally I want to create a new column with the mean of the values of the Over2.5G column of those 3 most similar rows.
New column:
First 3 rows(of the Season) NAs.
In fourth row(of the Season) the 3 nearest neighbours (and their Over2.5G values) will always be the first 3 rows.
breaking below code up:
a helper function which returns row indices of nearest neighbours with a ready-made function, e.g. get.knn of package FNN
calling this function for increasingly large slices (from row one to current) of the input data df and storing the result as an extra column
extracting the row indices as integers from the result string to index the desired column of the input data for the aggregation (mean, in your case)
here we go:
## helper function returns row indices of nearest 3 neighbours
## as comma-separated string
find_nearest_predecessors <- function(df, ...){
ifelse(nrow(df) < 4, ## can't calculate n neighbours for <n rows:
paste(1:3, collapse = ','),
## otherwise = if sufficient rows,
## get row indices of 3 nearest neighbours:
get.knn(data = df,
k = 3,
algo = 'CR'
) %>%
.[['nn.index']] %>%
tail(1) %>% paste(collapse = ',')
)
}
## df being your input data:
df %>%
mutate(rownum = row_number()) %>%
rowwise %>%
mutate(nearest_neighbours = find_nearest_predecessors(
df = ## use previous data up to current row:
slice(df, 1:rownum) %>%
## choose features/dimensions of distance:
select(HT.av.points, AT.av.points, HT_av.PointsTotal,
AT_av.PointsTotal, DIFF.AV.POINTS.PREDICTION)
),
## calculate mean of OVER2.5G
mean_Over2.5G = mean(df$Over2.5G[
strsplit(nearest_neighbours,',') %>%
unlist %>% as.integer
], na.rm = TRUE)
)

Create subset of data using conditions from another data frame

I would like to use something like dplyr to create a subset of data from one data frame using conditions from another data frame. So in one data frame I have a set of data with minimum and maximum years and other sea-level data lsp , and in another frame I have a time series of ocean dynamics. For each row in the lsp dataframe, I would like to extract every year between the minimum and maximum ages in the dynamics data frame and create a sub set of data. I think this will require a for loop. Does anyone have any idea if this is possible?
Desired output using row 1 of LSP as an example:
Row 1 LSP (simplified) is:
Age min
Age max
1997
2007
I want to use this information to create a data frame like this from the dynamics file:
Subset
Year
Dynamics
1997
125
1998
109
1999
152
2000
161
2001
106
2002
120
2003
58
2004
68
2005
110
2006
144
2007
100
Many thanks
## LSP data
structure(list(Depth = c(0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5,
8.5, 10.5, 13.5, 14.5, 18.5, 19.5, 27.5, 28.5, 32, 35.5, 40.5,
41.5), RSL = c(0.03, 0.03, 0.01, 0.01, -0.04, -0.01, -0.03, 0,
0.04, 0.03, 0, -0.01, -0.05, -0.07, -0.19, -0.24, -0.31, -0.31,
-0.27, -0.29), RSL_err_1sig = c(0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
0.1), Age_mean = c(2001.754499, 1994.278776, 1987.678949, 1980.805889,
1973.270485, 1965.018421, 1957.442729, 1952.134369, 1949.031929,
1945.148184, 1939.132213, 1936.957531, 1927.311071, 1924.379033,
1897.26123, 1892.977317, 1876.1995, 1858.135589, 1825.967544,
1820.605298), Age.min = c(1996.752238, 1985.111654, 1977.483594,
1968.26211, 1961.886124, 1958.219318, 1947.496532, 1943.084044,
1941.761439, 1935.843414, 1923.952516, 1920.057048, 1906.228232,
1902.242998, 1875.327613, 1869.925103, 1834.992176, 1811.928966,
1784.998245, 1767.524866), Age.max = c(2006.75676, 2003.445898,
1997.874304, 1993.349668, 1984.654846, 1971.817524, 1967.388926,
1961.184694, 1956.302419, 1954.452954, 1954.31191, 1953.858014,
1948.39391, 1946.515068, 1919.194847, 1916.029531, 1917.406824,
1904.342212, 1866.936843, 1873.68573)), class = "data.frame", row.names = c(NA,
-20L))
## Dynamics (only head)
structure(list(Year = 1815:1820, dynamics = c(-76.01893261, -64.50519732,
-66.06270761, -76.22822397, -72.35960029, -77.34157443)), row.names = c(NA,
6L), class = "data.frame")
Here is a base R option with Map and subset -
Map(function(x, y) subset(dynamics, Year >= x & Year <= y),
LSP$Age.min, LSP$Age.max)
The same logic can be implemented using tidyverse functions as well.
library(dplyr)
library(purrr)
map2(LSP$Age.min, LSP$Age.max, ~dynamics %>% filter(Year >= .x & Year <= .y))
As long as your dataset isn't huge, I would take something like the following approach.
Add the (nested) dynamics dataset to each row of your lsp dataset
Unnest the dynamics dataset to get one row per year
Filter out years that aren't relevant
(Optional)
Renest the dynamics columns to you have one row per lsp record with a tibble for all relevant years from the dynamics set.
lsp %>%
add_column(dynamics %>% nest(data = everything())) %>%
unnest(data) %>%
filter(year >= min & year <= max) %>%
nest(filtered = c(year, value))
I guess this does what you want to do. First assign names to your input data, so later you know what my codes mean.
lsp <- structure(list(Depth = c(0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5,
8.5, 10.5, 13.5, 14.5, 18.5, 19.5, 27.5, 28.5, 32, 35.5, 40.5,
41.5), RSL = c(0.03, 0.03, 0.01, 0.01, -0.04, -0.01, -0.03, 0,
0.04, 0.03, 0, -0.01, -0.05, -0.07, -0.19, -0.24, -0.31, -0.31,
-0.27, -0.29), RSL_err_1sig = c(0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
0.1), Age_mean = c(2001.754499, 1994.278776, 1987.678949, 1980.805889,
1973.270485, 1965.018421, 1957.442729, 1952.134369, 1949.031929,
1945.148184, 1939.132213, 1936.957531, 1927.311071, 1924.379033,
1897.26123, 1892.977317, 1876.1995, 1858.135589, 1825.967544,
1820.605298), Age.min = c(1996.752238, 1985.111654, 1977.483594,
1968.26211, 1961.886124, 1958.219318, 1947.496532, 1943.084044,
1941.761439, 1935.843414, 1923.952516, 1920.057048, 1906.228232,
1902.242998, 1875.327613, 1869.925103, 1834.992176, 1811.928966,
1784.998245, 1767.524866), Age.max = c(2006.75676, 2003.445898,
1997.874304, 1993.349668, 1984.654846, 1971.817524, 1967.388926,
1961.184694, 1956.302419, 1954.452954, 1954.31191, 1953.858014,
1948.39391, 1946.515068, 1919.194847, 1916.029531, 1917.406824,
1904.342212, 1866.936843, 1873.68573)), class = "data.frame", row.names = c(NA,
-20L))
dynamics <- structure(list(Year = 1815:1820, dynamics = c(-76.01893261, -64.50519732,
-66.06270761, -76.22822397, -72.35960029, -77.34157443)), row.names = c(NA,
6L), class = "data.frame")
Then the actual codes to get the subset.
# first get info of years from the "lsp" dataset
# following your example in your comments
year_min <- list()
year_max <- list()
all_years <- list()
for(i in 1:nrow(lsp)){
year_min[[i]] <- round(lsp$Age.min[[i]])
year_max[[i]] <- round(lsp$Age.max[[i]])
all_years[[i]] <- c(year_min[[i]]:year_max[[i]])
all_years[[i]] <- as.data.frame(all_years[[i]])
colnames(all_years[[i]]) <- "Year"
}
# now join the info on "Year" from "lsp" data with "dynamics" data to get the subset
library(dplyr)
subset_output <- list()
for (i in 1:length(all_years)){
subset_output[[i]] <- left_join(dynamics,all_years[[i]])
}

Removing NAs from ggplot x-axis in ggplot2

I would like to get rid off the whole NA block (highlighted here ).
I tried na.ommit and na.rm = TRUE unsuccesfully.
Here is the code I used :
library(readxl)
data <- read_excel("Documents/TFB/xlsx_geochimie/solfatara_maj.xlsx")
View(data)
data <- gather(data,FeO:`Fe2O3(T)`,key = "Element",value="Pourcentage")
library(ggplot2)
level_order <- factor(data$Element,levels = c("SiO2","TiO2","Al2O3","Fe2O3","FeO","MgO","CaO","Na2O","K2O"))
ggplot(data=data,mapping=aes(x=level_order,y=data$Pourcentage,colour=data$Ech)+geom_point()+geom_line(group=data$Ech) +scale_y_log10()
And here is my original file
https://drive.google.com/file/d/1bZi7fPWebbpodD1LFScoEcWt5Bs-cqhb/view?usp=sharing
If I run your code and look at data that goes into ggplot:
table(data$Element)
Al2O3 CaO Fe2O3 Fe2O3(T) FeO K2O LOI LOI2 MgO MnO
12 12 12 12 12 12 12 12 12 12
Na2O P2O5 SiO2 SO4 TiO2 Total Total 2 Total N Total S
12 12 12 12 12 12 12 12 12
You have included Total into the melted data frame.. which is not intended I guess. Hence when you do factor on these, and these "Total.." are not included in the levels, they become NA.
So we can do it from scratch:
data <- read_excel("solfatara_maj.xlsx")
The data:
structure(list(Ech = c("AGN 1A", "AGN 2A", "AGN 3B", "SOL 4B",
"SOL 8Ag", "SOL 8Ab", "SOL 16A", "SOL 16B", "SOL 16C", "SOL 22 A",
"SOL 22D", "SOL 25B"), FeO = c(0.2, 0.8, 1.7, 0.3, 1.7, NA, 0.2,
NA, 0.1, 0.7, 1.3, 2), `Total S` = c(5.96, 45.3, 0.22, 17.3,
NA, NA, NA, NA, NA, NA, 2.37, 0.36), SO4 = c(NA, 6.72, NA, 4.08,
0.06, 0.16, 42.2, 35.2, 37.8, 0.32, 6.57, NA), `Total N` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, 15.2, NA, NA), SiO2 = c(50.2,
31.05, 56.47, 62.14, 61.36, 75.66, 8.41, 21.74, 17.44, 13.52,
19.62, 56.35), Al2O3 = c(15.53, 7.7, 17.56, 4.44, 17.75, 10.92,
31.92, 26.38, 27.66, 0.64, 3.85, 17.28), Fe2O3 = c(0.49, 0.63,
2.06, NA, 1.76, 0.11, 0.64, 0.88, 1.71, NA, 1.32, 2.67), MnO = c(0.01,
0.01, 0.13, 0.01, 0.09, 0.01, 0.01, 0.01, 0.01, 0.005, 0.04,
0.12), MgO = c(0.06, 0.07, 0.88, 0.03, 0.97, 0.05, 0.04, 0.07,
0.03, 0.02, 1.85, 1.63), CaO = c(0.2, 0.09, 3.34, 0.09, 2.58,
0.57, 0.2, 0.26, 0.15, 0.06, 35.66, 4.79), Na2O = c(0.15, 0.14,
3.23, 0.13, 3.18, 2.04, 0.68, 0.68, 0.55, 0.05, 0.45, 3.11),
K2O = c(4.39, 1.98, 8, 1.26, 8.59, 5.94, 8.2, 6.97, 8.04,
0.2, 0.89, 7.65), TiO2 = c(0.42, 0.27, 0.46, 0.79, 0.55,
0.16, 0.09, 0.22, 0.16, 0.222, 0.34, 0.53), P2O5 = c(0.11,
0.09, 0.18, 0.08, 0.07, 0.07, 0.85, 0.68, 0.62, NA, 0.14,
0.28), LOI = c(27.77, 57.06, 6.13, 29.03, 1.38, 4.92, 42.58,
37.58, 38.76, NA, 26.99, 3.92), LOI2 = c(27.79, 57.15, 6.32,
29.06, 1.57, 4.93, 42.6, 37.59, 38.77, 0.08, 27.13, 4.15),
Total = c(99.52, 99.88, 100.2, 98.25, 99.99, 100.5, 93.81,
95.57, 95.23, 15.25, 92.45, 100.3), `Total 2` = c(99.54,
99.96, 100.3, 98.28, 100.2, 100.6, 93.83, 95.58, 95.24, 15.33,
92.59, 100.6), `Fe2O3(T)` = c(0.71, 1.52, 3.95, 0.27, 3.65,
0.22, 0.87, 0.99, 1.82, 0.61, 2.76, 4.9)), row.names = c(NA,
-12L), class = c("tbl_df", "tbl", "data.frame"))
First we set the plotting level like you did:
plotlvls = c("SiO2","TiO2","Al2O3","Fe2O3","FeO","MgO","CaO","Na2O","K2O")
Then we select only these columns, and also Ech, note I use pivot_longer() because gather() will supposedly be deprecated, and then we do the factoring too:
plotdf = data %>% select(c(plotlvls,"Ech")) %>%
pivot_longer(-Ech,names_to = "Element",values_to = "Pourcentage") %>%
mutate(Element=factor(Element,levels=toplot))
Finally we plot, and there are no NAs:
ggplot(data=plotdf,mapping=aes(x=Element,y=Pourcentage,colour=Ech))+
geom_point()+geom_line(aes(group=Ech)) +scale_y_log10()
1.Create reproducible minimal data
data <- data.frame(Element = c("SiO2","TiO2","Al2O3","Fe2O3","FeO","MgO","CaO","Na2O","K2O",NA),
Pourcentage = 1:10,
Ech = c("AGN 1A", "SOL 16"))
2.Set factor levels for variable 'Element'
data$Element <- factor(data$Element,levels = c("SiO2","TiO2","Al2O3","Fe2O3","FeO","MgO","CaO","Na2O","K2O"))
3.Remove rows containing NA in the variable 'Element'
data <- data[!is.na(data$Element), ]
4.Plot data using ggplot2 (ggplot2 syntax uses NSE (non standard evaluation), which means you dont't have to pass the variable names as strings or using the $ notation):
ggplot(data=data,aes(x=Element,y=Pourcentage,colour=Ech)) +
geom_point() +
geom_line(aes(group=Ech)) +
scale_y_log10()

How to get original row index after extracting and working with a subset of a data frame

I have a data frame
df<-structure(list(time = structure(c(1080868500, 1080868800, 1080869100,
1080869400, 1080869700, 1080870000, 1080870300, 1080870600, 1080870900,
1080871200, 1080871500, 1080871800, 1080872100, 1080872400, 1080872700,
1080873000, 1080873300, 1080873600, 1080873900, 1080874200, 1080874500,
1080874800, 1080875100, 1080875400, 1080875700, 1080876000, 1080876300,
1080876600, 1080876900, 1080877200, 1080877500, 1080877800, 1080878100,
1080878400, 1080878700, 1080879000, 1080879300, 1080879600, 1080879900,
1080880200, 1080880500, 1080880800, 1080881100, 1080881400, 1080881700,
1080882000, 1080882300, 1080882600, 1080882900, 1080883200), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), precip = c(1.76, 1.76, 1.21, 0.78,
0.59, 0.59, 0.62, 0.62, 0.81, 0.81, 1.14, 0.82, 0.87, 1.03, 0.98,
0.77, 0.77, 0.45, 0.55, 0.82, 0.8, 0.58, 0.7, 0.7, 1.03, 1.25,
1.32, 1.68, 2.6, 1.49, 3.85, 3.91, 2.94, 3.63, 4.12, 1.85, 2.02,
3.46, 3.45, 2.53, 2.88, 3, 2.42, 1.56, 1.44, 1.43, 1.33, 1.27,
1.35, 1.4)), .Names = c("time", "precip"), row.names = 236752:236801, class = "data.frame")
I want to find the maximum value that exists between rows 10 and 20. But I want to find the original index of that maxim value.
which.max(df[10:20,]$precip)
gives me the index of 2. I know I can add it to row index 10. But is there a proper way to do it?
It is simply (10:20)[2].
Now consider a more complicated case:
set.seed(0)
index <- sample(1:nrow(df), 10) ## a random subset of size 10
pos <- which.max(df[index,"precip"]) ## position in the subset data frame
# [1] 5
index[pos] ## position in the original data frame
# [1] 42
Thanks. Can you explain why this is a more complicated case? Looks the same as my case.
In your case where index = 10:20, which is consecutive and sorted increasingly, you may add 10 + 2 = 12. But in my case where index is not consecutive (even not sorted):
# [1] 45 14 18 27 42 10 40 41 28 26
there is no way you can do an addition to get row number in original data frame.
This seems a reasonably straightforward method using logical indexing with two conditions each focussed on the 10th to 20th rows:
df[rownames(df) %in% rownames(df)[10:20] & df$precip == max(df$precip[10:20]), ]
#--------------
time precip
236762 2004-04-02 02:05:00 1.14
If you wnat just the rowname "236762" you could just wrap rownames() around that dataframe value. You can index dataframes with rownames and itn this case you would see:
df["236762" , ] # note the need for quoting. The name is a character value.
#
# time precip
#236762 2004-04-02 02:05:00 1.14

How can I stop the overwriting of lagged columns?

Reference: https://stackoverflow.com/a/28056113/3942806
I am using the code from the link above to create lagged values for two columns.
n<-4
odd2<-setDT(odd)[, paste("OBS_Q", 1:n) := shift(OBS_Q, 1:n)]
odd2<-setDT(odd)[, paste("sac", 1:n) := shift(sac, 1:n)]
This works great! I get 18 columns.
But for convenience, I tried to convert it into a function:
masterlag<-function(df,col,n){
setDT(df)[, paste(col,sep='_',1:n) := shift(df[[col]], 1:n)]
}
odd3<-masterlag(df=odd,col="OBS_Q",n=4)
odd3<-masterlag(df=odd,col="sac",n=4)
But in this case, the newly created columns of the first one ('OBS_Q') are getting replaced when I used the function the second time ('sac'). So, I am only left with 14 columns instead of 18.
Any pointers as to why?
odd<-structure(list(DATE = 19630101:19630104, PRECIP = c(0, 0, 0,0),
OBS_Q = c(1.61, 1.48, 1.4, 1.33), swb = c(1.75, 1.73, 1.7,1.67),
gr4j = c(1.9, 1.77, 1.67, 1.58), isba = c(0.83, 0.83,0.83, 0.83),
noah = c(1.31, 1.19, 1.24, 1.31), sac = c(1.99,1.8, 1.66, 1.57),
swap = c(1.1, 1.05, 1.08, 0.99), vic.mm.day. = c(2.1,1.75, 1.55, 1.43)),
.Names = c("DATE", "PRECIP", "OBS_Q", "swb","gr4j", "isba", "noah", "sac", "swap", "vic.mm.day."),
class = c("data.table","data.frame"), row.names = c(NA, -4L))
The dataframes are self-updating
odd<-masterlag(df=odd,col="OBS_Q",n=4)
odd<-masterlag(df=odd,col="sac",n=4)

Resources