R Shiny: split data and render data tables

I have a dataframe like the one below (df1). I am filtering df1 based on user input and can summarize the data to reflect the totals. But based on user input I also want to split the data into multiple tables, then group and summarize each one in a specific format. I'm not sure if I need to melt the data and then use pivot_wider().
df1<-structure(list(record_id = c(1, 1, 1, 1, 1, 1), Name = c("Anna",
"Anna", "Anna", "Anna", "Anna",
"Anna"), Country = c("USA", "USA",
"USA", "USA", "USA",
"USA"),
record_id.y = c("1", "2", "3", "4", "5",
"6"), emp_id = c("1837100", "203013",
"1820027", "1852508", "2123813",
"1887667"), rel = c("S", "M", "I",
"F", "I", "I"), Date = structure(c(17869,
17862, 17865, 17848, 17862, 17848), class = "Date"), date1 = structure(c(1639134523,
1638615986, 1638764440, 1638876083, 1644605968, 1638764441
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), date2 = structure(c(NA,
NA, NA, NA, 1638615988, NA), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), typec = c(1, 1, 1, 1, 1, 1
), typer = c(0, 0, 0, 0, 0, 0), typey = c(0,
0, 0, 0, 0, 0), is_present = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
)), row.names = c(NA, 6L), class = "data.frame")
My code
table1 <- reactive({
  req(input$Name, input$Country, input$status, input$type)
  my_data %>%
    select(Name, Country, typec, typer, typey, date1, date2, Date, rel) %>%
    filter(Name %in% input$Name,
           Date >= input$dates[1] & Date <= input$dates[2]) %>%
    filter((('Status1' %in% input$status) & !is.na(date1)) |
             (('Status2' %in% input$status) & !is.na(date2))) %>%
    filter((('C' %in% input$type) & typec == '1') |
             (('R' %in% input$type) & typer == '1') |
             (('Y' %in% input$type) & typey == '1')) %>%
    filter(rel %in% input$rel) %>%
    split(input$Name) %>%
    group_by(get(input$status, input$rel, input$type)) %>%
    summarize(Total = n(), .groups = "drop")
})
output$table <- DT::renderDataTable({
  datatable(table1())
})
Desired output format
Results for Anna
C R Y
Status1
Status2
Results for Vika
C R Y
Status1
Status2
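One way to get that shape (a sketch only, shown outside Shiny; the Status/Type derivation and the use of count() and pivot_wider() are my assumptions about the intended grouping, and per_name is just an illustrative name):
library(dplyr)
library(tidyr)

# Sketch: label each row with a Status (from date1/date2) and a Type
# (from the typec/typer/typey flags), count, pivot wide, then split by Name.
per_name <- df1 %>%
  mutate(Status = case_when(!is.na(date2) ~ "Status2",
                            !is.na(date1) ~ "Status1"),
         Type = case_when(typec == 1 ~ "C",
                          typer == 1 ~ "R",
                          typey == 1 ~ "Y")) %>%
  count(Name, Status, Type) %>%
  pivot_wider(names_from = Type, values_from = n, values_fill = 0) %>%
  split(.$Name)
Each element of per_name would then be one "Results for <Name>" table, which could be rendered in the app by looping over the list (e.g. lapply() with renderUI() and DT::renderDataTable()).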

Related

Visualizing average sentiment by day&year (ggplot)

I would like to visualize consumer sentiment by day and year across different years. For example, I am interested in comparing consumer sentiment on Dec 18th of 2011 to Dec 18th of 2012.
Currently, I have been able to do so by month and year, but I want to visualize the data at a more granular level.
# Creating a month-year variable
valences_by_post <- valences_by_post %>%
  mutate(month_year = zoo::as.yearmon(date))

# 2011 & 2012
valence_11_12 <- valences_by_post %>%
  filter(year == 2011 | year == 2012) %>%
  group_by(month_year) %>%
  summarize(mean_valence = mean(valence), n = n())

ggplot(valence_11_12, aes(x = factor(month_year), y = mean_valence, group = 1)) +
  geom_point() +
  geom_line() +
  geom_smooth() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
This produces the expected month-by-year plot (figure omitted).
However, to compute sentiment by day&year, and visualize across different years, I ran the following:
valences_by_post <- valences_by_post %>%
  mutate(year_day = paste(lubridate::year(date), lubridate::yday(date), sep = "-"))
head(valences_by_post$year_day)

valence_day <- valences_by_post %>%
  filter(year == 2011 | year == 2012) %>%
  group_by(year_day) %>%
  summarize(mean_valence = mean(valence), n = n())
Then I build the graph below, but I receive the error "Error: Discrete value supplied to continuous scale" because the year_day variable is stored as character. Is there a workaround for this, or an equivalent of zoo::as.yearmon() for day-and-year in another package?
ggplot(valence_day, aes(x = year_day, y = mean_valence)) +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = seq(1, 365, 1)) +
  geom_smooth()
Here are data samples:
dput(head(valence_day,5))
structure(list(year_day = c("2011-175", "2011-176", "2011-177",
"2011-182", "2011-189"), mean_valence = c(0, 0.0806100217864924,
0.0714285714285714, 0, 0.5), n = c(1L, 9L, 1L, 1L, 1L)), row.names = c(NA,
-5L), class = c("tbl_df", "tbl", "data.frame"))
And
dput(head(valences_by_post,5))
structure(list(document = c("1", "2", "3", "4", "5"), positive = c(1,
0, 2, 1, 1), negative = c(1, 1, 0, 0, 1), total_words = c(34,
13, 4, 3, 6), valence = c(0, -0.0769230769230769, 0.5, 0.333333333333333,
0), date = structure(c(1308873600, 1308960000, 1308960000, 1308960000,
1308960000), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
year = c(2011, 2011, 2011, 2011, 2011), month = c(6, 6, 6,
6, 6), year_day = c("2011-175", "2011-176", "2011-176", "2011-176",
"2011-176"), month_year = structure(c(2011.41666666667, 2011
IMHO there is no need to add a year_day; it is basically the same information as the date. Hence, you could do your computations by converting your date (which is a datetime object) to a Date, and to show the year-day in the plot, use the labels argument of scale_x_date:
library(dplyr)
library(ggplot2)
valence_day <- valences_by_post %>%
  filter(year %in% c(2011, 2012)) %>%
  group_by(date = as.Date(date)) %>%
  summarize(mean_valence = mean(valence), n = n())

ggplot(valence_day, aes(x = date, y = mean_valence)) +
  geom_point() +
  geom_line() +
  scale_x_date(labels = ~ paste(lubridate::year(.x), lubridate::yday(.x), sep = "-")) +
  geom_smooth()
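If the goal is to compare the same calendar day across years (e.g. Dec 18th of 2011 vs. Dec 18th of 2012), one further option is to put the day of year on the x-axis and map the year to colour. A sketch, assuming the same valences_by_post (valence_overlay is just an illustrative name):
library(dplyr)
library(ggplot2)

valence_overlay <- valences_by_post %>%
  filter(year %in% c(2011, 2012)) %>%
  group_by(year, yday = lubridate::yday(date)) %>% # day of year within each year
  summarize(mean_valence = mean(valence), .groups = "drop")

ggplot(valence_overlay, aes(x = yday, y = mean_valence, colour = factor(year))) +
  geom_point() +
  geom_line() +
  labs(x = "Day of year", colour = "Year")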
DATA
valences_by_post <- structure(list(
document = c("1", "2", "3", "4", "5"), positive = c(
1,
0, 2, 1, 1
), negative = c(1, 1, 0, 0, 1), total_words = c(
34,
13, 4, 3, 6
), valence = c(
0, -0.0769230769230769, 0.5, 0.333333333333333,
0
), date = structure(c(
1308873600, 1308960000, 1308960000, 1308960000,
1308960000
), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
year = c(2011, 2011, 2011, 2011, 2011), month = c(
6, 6, 6,
6, 6
), month_year = structure(c(
2011.41666666667, 2011.41666666667,
2011.41666666667, 2011.41666666667, 2011.41666666667
), class = "yearmon")
), row.names = c(
NA,
5L
), class = "data.frame")

Is there an R function to do multiple correlations rather than rewriting the code block?

I'm currently having to subset the data in multiple ways to extract what I need to run a correlation and identify whether there is a relationship between the same metals across groups.
Group1 <- subset(Data_Set, subset = Data_Set$Sample == "1")
Group1A <- subset(Group1, subset = Group1$Sample_Type == "A")
GroupX <- subset(Data_Set, subset = Data_Set$Sample == "X")
GroupX <- subset(GroupX, Sample_ID %in% Group1A$Sample_ID)
cor.test(Group1A$Pb, GroupX$Pb, method = "kendall")
Sample_ID is used to match between groups.
However, this is very inefficient. I have six groups in total, each with between 1 and 6 subgroups composed of 1-40 samples, and I am looking to see if there is a relationship between any of these groups and GroupX. Is there a function to speed this up?
The output would be something along the lines of:
Pb T p-value tau
Group1A~GroupX 340 0.001 0.5902
Group1B~GroupX 435 0.03 0.2344
.....
Group6C~GroupX 344 0.001 0.4566
And this would be repeated for 5 other metals
I was thinking a standard correlation matrix but this does correlations between metals within a group.
Thanks!
EDIT: Sample data as requested
structure(list(Sample = c("2", "2", "2", "2", "X", "2", "2",
"2", "2", "2", "2", "X", "2", "2", "5", "5", "5", "5", "5", "X",
"5", "5", "3", "3", "X", "3", "3", "X", "4", "4"), Sample_ID = c("DC001",
"DC001", "DC001", "DC001", "DC001", "DC001", "DC001", "DC002",
"DC002", "DC002", "DC002", "DC002", "DC002", "DC002", "DC003",
"DC003", "DC003", "DC003", "DC003", "DC003", "DC003", "DC003",
"DC004", "DC004", "DC004", "DC005", "DC005", "DC005", "DC006",
"DC006"), Sample_Type = c("A", "D", "E", "F", "X", "I", "J",
"A", "D", "E", "F", "X", "I", "J", "A", "B", "D", "E", "F", "X",
"I", "J", "C", "F", "X", "C", "F", "X", "A", "D"), Co = c(0,
0.204473214269861, 0, 0.50977856054987, 0.262230521160956, 0,
0, 0, 0, 0, 0.465855303428853, 0.229502158969648, 0.214970121592712,
0.588126362402572, 0, 0.0906122639531158, 0.229838105464066,
0, 0.240533898070871, 4.77802122014029, 0.47537095149254, 0.384495379166814,
0.00135414270258444, 0.458235177876183, 0.412977043885698, 0.187579567424379,
0.317854941692133, 0.0271598068567071, 0, 0.293328743450483),
Ni = c(2.32894078024542, 0, 2.75976812547636, 2.35251746719724,
0.351631195258774, 1.25476391714642, 0.0586626807902249,
0, 2.31716731851309, 0, 4.03426936736104, 0.414520597983989,
2.69897385721456, 0.781651988488391, 1.48260693680732, 1.59083944326126,
0.944038748319438, 3.06889126279262, 1.69552165261712, 0.849220149877567,
1.75387912556474, 0, 0.333762199305291, 1.66187141150986,
0.735834552887327, 3.72419677755011, 1.27862769479216, 0.264762516047524,
1.84288031704096, 1.8828793053893), Cu = c(16.6696573471153,
21.377014252538, 16.4581203986139, 6.49438237470201, 1.57054125960644,
5.67180974109468, 23.5835333332964, 38.6483288663375, 15.2589198442198,
21.9746392829346, 7.09307693625389, 0.967127488045321, 6.32542891436958,
16.1173426649179, 11.2222721930992, 8.42093833910001, 11.1332246071585,
16.7442343774396, 10.8140656299147, 14.2632807636599, 5.35502290473828,
7.29141216675894, 2.53789491234011, 16.5791995430022, 1.00648647764661,
26.6313784234462, 0.0413060789264422, 0.656674377606213,
3.98095036332964, 6.17760205144632), Zn = c(76.5281110975817,
2652.50181007495, 1007.00556337852, 206.99812727191, 640.15733114957,
484.221162531697, 3718.61286231799, 131.574098527507, 9826.49966864988,
1827.75831773692, 557.015412652748, 850.519284594127, 955.085171501707,
3039.23169926716, 117.947177178762, 65.7886442827721, 78.1092625035093,
253.691311074245, 980.544294923672, 506.400193234096, 1110.92409209043,
902.659801267825, 284.143460051779, 991.762202132739, 899.71040333897,
1686.99915717559, 27.0835877755038, 956.364728487396, 142.167067778216,
1012.61495002819), As = c(0, 0, 1.91185052013389, 1.32808264279786,
0.141039242323703, 1.74872331719823, 0.1065340816859, 0.812367854870543,
0, 0.797230094696634, 2.38925992872935, 0.305621793073037,
0.664951374730799, 0, 0, 0, 2.52051964809224, 0, 0, 0.392178178336116,
0, 3.08334159340895, 2.32108729394528, 1.62081021652742,
0.171200134084414, 6.19125023716284, 4.43213876523911, 0.289386770990403,
0.313331113399545, 6.41607755268465), Cd = c(8.22465741493669,
22.6126042664945, 34.0150873273517, 13.5844058876617, 5.22665850051452,
24.0465414683255, 109.478598702669, 15.1992477278811, 169.517190223851,
75.2983940524065, 34.5230481628261, 3.75297525105592, 45.6178498733986,
247.435132822196, 2.10793502840313, 1.47647473271431, 0.0848090794945706,
2.98717760781629, 3.13384011407655, 5.31936421369202, 3.73593799828465,
5.36310372449921, 0.298562637256625, 1.82673831232711, 3.78462211601718,
8.0628550389363, 0.138799690323038, 1.32275598609847, 0.285061500560821,
0.635235209786838), Pb = c(0.922803462498185, 5.13959353157866,
1.9525414480789, 0, 2.5902978681043, 1.21865949505257, 7.09067896476338,
0, 3.89524247237658, 0.354938950934777, 2.64634863087263,
0.356658949506862, 1.25701617111933, 4.18799241835111, 0,
0.807369345092201, 0.0263264119388502, 0, 3.32333444396018,
76.7555925603143, 0.613522400825461, 0, 1.72315815094652,
3.21414903849599, 1.03802696495681, 1.73176109371547, 0.72736174943572,
0.23309888503164, 12.8688959655249, 33.2486209089115)), row.names = c(NA,
-30L), class = c("tbl_df", "tbl", "data.frame"))
Here's a solution that gets you exactly what you want. It looks a little long and forbidding, but I've tried to make it easy to follow and expand, and to bulletproof it against missing observations, NAs, etc. I also have a brute-force solution that does all the correlations and then removes the ones you don't need.
library(dplyr)
library(stringr)
library(purrr)
library(broom)
# made up data set that is similar to yours but with missing rows and NAs
set.seed(2020)
Data_Set <-
  data.frame(
    Sample = c(rep("X", times = 10), rep("2", times = 20), "X", "2"),
    Sample_ID = c(rep(c("DC001", "DC002", "DC003", "DC004", "DC005",
                        "DC006", "DC007", "DC008", "DC009", "DC010"), times = 3),
                  "DC011", "DC012"),
    Sample_Type = c(rep("X", times = 10), rep("A", times = 10), rep("D", times = 10), "X", "A"),
    Co = runif(32, 0, 5),
    Ni = runif(32, 0, 4.1),
    Cu = runif(32, 0, 39),
    Zn = runif(32, 27, 9800),
    As = runif(32, 0, 6),
    Cd = runif(32, 0, 247),
    Pb = runif(32, 0, 78)
  )
Data_Set[15,5] <- NA
# Data_Set
# Collapse Sample and Sample_Type into one Group variable
Data_Set <-
  Data_Set %>%
  mutate(Group = str_c(Sample, Sample_Type)) %>%
  select(Group, everything())

# Pull out Group XX (our baseline) and relabel
# the metals with an _X on the end
JustGroupX <-
  Data_Set %>%
  filter(Group == "XX") %>%
  mutate(Group = "X") %>%
  rename(Co_X = Co, Ni_X = Ni, Cu_X = Cu, Zn_X = Zn,
         As_X = As, Cd_X = Cd, Pb_X = Pb) %>%
  select(-Group, -Sample, -Sample_Type)

# a df with no XX
AllNotX <-
  Data_Set %>%
  filter(Group != "XX")

# Make a list of DFs by Group
ListofGroupDFs <-
  AllNotX %>%
  split(.$Group)
# glimpse(ListofGroupDFs)

ListofGroupDFs <- map(ListofGroupDFs, ~ inner_join(., JustGroupX, by = "Sample_ID"))
# this part is inelegant since it simply repeats the same code for each metal
# (see the sketch after the results below for a way to collapse it)
CoResults <-
  map_dfr(ListofGroupDFs,
          ~ tidy(cor.test(.$Co, .$Co_X, method = "kendall")),
          .id = "ComparedwithX") %>%
  mutate(Metal = "Co")
NiResults <-
  map_dfr(ListofGroupDFs,
          ~ tidy(cor.test(.$Ni, .$Ni_X, method = "kendall")),
          .id = "ComparedwithX") %>%
  mutate(Metal = "Ni")
CuResults <-
  map_dfr(ListofGroupDFs,
          ~ tidy(cor.test(.$Cu, .$Cu_X, method = "kendall")),
          .id = "ComparedwithX") %>%
  mutate(Metal = "Cu")
ZnResults <-
  map_dfr(ListofGroupDFs,
          ~ tidy(cor.test(.$Zn, .$Zn_X, method = "kendall")),
          .id = "ComparedwithX") %>%
  mutate(Metal = "Zn")
AsResults <-
  map_dfr(ListofGroupDFs,
          ~ tidy(cor.test(.$As, .$As_X, method = "kendall")),
          .id = "ComparedwithX") %>%
  mutate(Metal = "As")
CdResults <-
  map_dfr(ListofGroupDFs,
          ~ tidy(cor.test(.$Cd, .$Cd_X, method = "kendall")),
          .id = "ComparedwithX") %>%
  mutate(Metal = "Cd")
PbResults <-
  map_dfr(ListofGroupDFs,
          ~ tidy(cor.test(.$Pb, .$Pb_X, method = "kendall")),
          .id = "ComparedwithX") %>%
  mutate(Metal = "Pb")
MyResults <- rbind(CoResults,
NiResults,
CuResults,
ZnResults,
AsResults,
CdResults,
PbResults)
MyResults <-
  MyResults %>%
  rename(tau = estimate, T = statistic) %>%
  select(Metal, ComparedwithX, tau, T, p.value)
MyResults
#> # A tibble: 14 x 5
#> Metal ComparedwithX tau T p.value
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 Co 2A 0.0222 23 1
#> 2 Co 2D 0.0667 24 0.862
#> 3 Ni 2A 0.444 26 0.119
#> 4 Ni 2D -0.111 20 0.727
#> 5 Cu 2A -0.2 18 0.484
#> 6 Cu 2D 0.0667 24 0.862
#> 7 Zn 2A 0.289 29 0.291
#> 8 Zn 2D 0.156 26 0.601
#> 9 As 2A -0.0222 22 1
#> 10 As 2D -0.422 13 0.108
#> 11 Cd 2A 0.2 27 0.484
#> 12 Cd 2D -0.0667 21 0.862
#> 13 Pb 2A -0.333 15 0.216
#> 14 Pb 2D 0.2 27 0.484
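As the comment in the code notes, the per-metal section repeats one pattern, so it could likely be collapsed by also mapping over the metal names. A sketch, assuming the ListofGroupDFs built above (metals is just an illustrative name):
metals <- c("Co", "Ni", "Cu", "Zn", "As", "Cd", "Pb")

MyResults <-
  map_dfr(metals, function(m) {
    map_dfr(ListofGroupDFs,
            ~ tidy(cor.test(.[[m]], .[[paste0(m, "_X")]], method = "kendall")),
            .id = "ComparedwithX") %>%
      mutate(Metal = m)
  }) %>%
  rename(tau = estimate, T = statistic) %>%
  select(Metal, ComparedwithX, tau, T, p.value)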

Create a contingency table with 2 factors from messy data

I have the following data in messy format:
structure(list(com_level = c("B", "B", "B", "B", "A", "A"),
hf_com = c(1, 1, 1, 1, 1, 1),
sal_level = c("2", "3", "1", "2", "1", "4"),
exp_sal = c(NA, 1, 1, NA, 1, NA)),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -6L))
Column com_level is the factor with 2 levels and column hf_com gives the frequency count for that level.
Column sal_level is the factor with 4 levels and column exp_sal gives the frequency count for that level.
I want to create a contingency table similar to this:
structure(list(`1` = c(1L, 2L),
`2` = c(0L, 1L),
`3` = c(0L, 2L),
`4` = c(1L, 0L)),
row.names = c("A", "B"), class = "data.frame")
I have code that works when I want to compare two columns with the same factor:
# Step 1: create a table with frequency counts for exp_sal and curr_sal per level
cs_es_table <- df_not_na_num %>%
  dplyr::count(sal_level, exp_sal, curr_sal) %>%
  tidyr::spread(key = sal_level, value = n) %>% # this code spreads on just one key
  select(curr_sal, exp_sal, 1, 2, 3, 4, 5, 6, 7, -8) %>% # reorder columns and omit column 8 (no answer)
  as.data.frame()
# Step 2: convert cs_es_table to long format and summarise exp_sal and curr_sal frequencies
cs_es_table <- cs_es_table %>%
  gather(key, value, -curr_sal, -exp_sal) %>% # crucial step to make the data long
  mutate(curr_val = ifelse(curr_sal == 1, value, NA),
         exp_val = ifelse(exp_sal == 1, value, NA)) %>% # assign a value to each new column for 'exp' and 'curr'
  group_by(key) %>% # group by key so the rows can be summed
  summarise_at(.vars = vars(curr_val, exp_val), .funs = sum, na.rm = TRUE)
This code produces the following table, but it spreads on just one key in step 1:
structure(list(curr_val = c(533L, 448L, 237L, 101L, 56L), exp_val = c(179L,
577L, 725L, 401L, 216L)), row.names = c("< 1000 EUR", "1001-1500 EUR",
"2001-3000 EUR", "3001-4000 EUR", "4001-5000 EUR"), class = "data.frame")
Will I need to use pivot_wider as in these examples?
Is it possible to use spread on multiple columns in tidyr similar to dcast?
or
tidyr::spread() with multiple keys and values
Any help would be appreciated to compare the two columns with different factors.
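For reference, the basic two-factor cross-tab pattern with count() plus pivot_wider() looks like this. A sketch only: it assumes each row counts toward its com_level/sal_level cell when exp_sal is 1, which may not reproduce the exact desired counts above (df stands for the messy tibble from the first dput):
library(dplyr)
library(tidyr)

df %>%
  filter(exp_sal == 1) %>%           # keep rows that count for the level
  count(com_level, sal_level) %>%    # frequency per factor pair
  pivot_wider(names_from = sal_level,
              values_from = n,
              values_fill = 0)       # absent combinations become 0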

Generate a column based on whether other columns are equal

What I want to do is generate a new column in a dataframe based on these conditions:
dataframe1$var1 == dataframe2$var1 &
dataframe1$var2 == dataframe2$var2 &
dataframe1$var3 == dataframe2$var3
Basically, I need to generate a dummy variable that has the value 1 if the conditions are met and the value 0 if they are not.
I've tried the following code, which doesn't work:
dataframe1$NewVar <- ifelse(dataframe1$var1 == dataframe2$var1 &
                              dataframe1$var2 == dataframe2$var2 &
                              dataframe1$var3 == dataframe2$var3, 1, 0)
Data
dput(df1)
structure(list(var1 = c("A", "B", "C"), var2 = c("X", "X", "X"
), var3 = c(1, 2, 2)), .Names = c("var1", "var2", "var3"), row.names = c(NA,
-3L), class = "data.frame")
dput(df2)
structure(list(var1 = c("A", "A", "C"), var2 = c("X", "X", "Y"
), var3 = c(1, 1, 1)), .Names = c("var1", "var2", "var3"), row.names = c(NA,
-3L), class = "data.frame")
By the way, my dataset is not as simple as the example posted above.
I don't know if it's relevant, but the values in my variables (columns) look like this:
var1: 24000000000
var2: 1234567
var3: 8
You can simply do:
as.integer(rowSums(df1 == df2) == ncol(df1))
#[1] 1 0 0
df1 == df2 compares the two data frames element-wise and returns a logical matrix; rowSums() counts the TRUEs per row, and a row matches overall exactly when that count equals ncol(df1).

How to optimize these for loops and function

Problem
I'm building some weather data and need to check that there are no outliers, no values equal to -9999, and no missing days. If any of these conditions is found, a function I've written, nearest(), finds the 5 closest stations, computes an inverse-distance-weighted value, and plugs it back in where the condition was found. The problem is that the code works but takes a very long time to run: I have over 600 stations, and each station takes about an hour to compute.
Question
Can this code be optimized to improve computation time? What is the best way to deal with nested for() loops being used this way?
Code
The following code uses a very small portion of the data set as a reproducible example. This obviously runs very fast, but spread over the entire data set it takes a long time. Notice that row 10 of newdat has an NA value; when the code is run, that value is replaced.
dput:
db_sid <- structure(list(id = "USC00030528", lat = 35.45, long = -92.4,
element = "TMAX", firstyear = 1892L, lastyear = 1952L, state = "arkansas"), .Names = c("id",
"lat", "long", "element", "firstyear", "lastyear", "state"), row.names = 5L, class = "data.frame")
output <- structure(list(id = c("USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632"), element = c("TMAX", "TMIN", "TMAX",
"TMIN", "TMAX", "TMIN", "TMAX", "TMIN", "TMAX", "TMIN"), year = c(1900,
1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900), month = c(1,
1, 2, 2, 3, 3, 4, 4, 5, 5), day = c(1, 1, 1, 1, 1, 1, 1, 1, 1,
1), date = structure(c(-25567, -25567, -25536, -25536, -25508,
-25508, -25477, -25477, -25447, -25447), class = "Date"), value = c(30.02,
10.94, 37.94, 10.94, NA, 28.04, 64.94, 41, 82.04, 51.08)), .Names = c("id",
"element", "year", "month", "day", "date", "value"), row.names = c(NA,
-10L), class = c("tbl_df", "data.frame"))
newdat <- structure(list(id = c("USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632"), element = structure(c(1L, 2L,
1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("TMAX", "TMIN"), class = "factor"),
year = c("1900", "1900", "1900", "1900", "1900", "1900",
"1900", "1900", "1900", "1900"), month = c("01", "01", "02",
"02", "03", "04", "04", "05", "05", "01"), day = c("01",
"01", "01", "01", "01", "01", "01", "01", "01", "02"), date = structure(c(-25567,
-25567, -25536, -25536, -25508, -25477, -25477, -25447, -25447,
-25566), class = "Date"), value = c(30.02, 10.94, 37.94,
10.94, 28.04, 64.94, 41, 82.04, 51.08, NA)), .Names = c("id",
"element", "year", "month", "day", "date", "value"), row.names = c(NA,
10L), class = "data.frame")
stack <- structure(list(id = c("USC00035754", "USC00236357", "USC00033466",
"USC00032930"), x = c(-92.0189, -95.1464, -93.0486, -94.4481),
y = c(34.2256, 39.9808, 34.5128, 36.4261), value = c(62.06,
44.96, 55.94, 57.92)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"), .Names = c("id", "x", "y", "value"))
station <- structure(list(id = "USC00031632", lat = 36.4197, long = -90.5858,
value = 30.02), row.names = c(NA, -1L), class = c("tbl_df",
"data.frame"), .Names = c("id", "lat", "long", "value"))
nearest() function:
nearest <- function(id, yr, mnt, dy, ele, out, stack, station){
  if (dim(stack)[1] >= 1){
    if (dim(stack)[1] == 1) {
      v <- stack$value
    } else {
      v <- idw(stack$value, stack[, 2:4], station[, 2:3])
    }
  } else {
    ret <- filter(out, id == s_id & year == yr, month == mnt,
                  element == ele, value != -9999)
    v <- mean(ret$value)
  }
  return(v)
}
for() loops:
library(dplyr)
library(phylin)
library(lubridate)
for (i in unique(db_sid$id)){
  # Check for outliers
  for (j in which(output$value > 134 | output$value < -80 | output$value == -9999)){
    output[j, 7] <- nearest(id = j, yr = as.numeric(output[j, 3]),
                            mnt = as.numeric(output[j, 4]), dy = as.numeric(output[j, 5]),
                            ele = as.character(output[j, 2]), out = output)
  }
  # Check for NA and replace
  for (k in which(is.na(newdat$value))){
    newdat[k, 7] <- nearest(id = k, yr = as.numeric(newdat[k, 3]),
                            mnt = as.numeric(newdat[k, 4]), dy = as.numeric(newdat[k, 5]),
                            ele = as.character(newdat[k, 2]), out = newdat,
                            stack = stack, station = station)
  }
}
I'm not sure I understand at all what you're trying to do. For example, the i from the outer for loop is never actually used. Here is some code that I think will be useful to you:
library(plyr)
library(dplyr)
output_summary =
  output %>%
  filter(value %>% between(-80, 134)) %>%
  group_by(date, element, id) %>%
  summarize(mean_value = mean(value))

if (nrow(stack) == 1) fill_value = stack$value else
  fill_value = idw(
    stack$value,
    stack %>% select(x, y, value),
    station %>% select(lat, long))

newdat_filled =
  newdat %>%
  mutate(filled_value =
           value %>%
           mapvalues(NA, fill_value))
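As a design note, fill_value is used as a single scalar here, so the last step could equivalently be written with tidyr::replace_na; a sketch with the same objects:
library(tidyr)

newdat_filled =
  newdat %>%
  mutate(filled_value = replace_na(value, fill_value))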
