Creating a dataframe using declared variables - r

I'm trying to create a dataframe to assist in downstream calculations. All "MineralVars" need an NA value in this 1-row dataframe. MineralVars will always change in length and variable names, so I can't create the dataframe by inputting each Mineral one at a time. I need to create it using the string 'MineralVars'.
# Declare variables
MineralVars <- c("Al", "Cu", "Pb")
# Create helper dataframe
Placeholder_df <- data.frame(make = "Placeholder", modelid = "Placeholder",
unitno = "Placeholder", compart = "Placeholder",
MineralVars = NA, meterread = NA, order = NA)
My desired dataframe would look like this;
Desired_df <- data.frame(sampledate = NA, make = "Placeholder", modelid = "Placeholder",
unitno = "Placeholder", compart = "Placeholder", oilchanged = "Placeholder",
Cu = NA, Pb = NA, Al = NA, meterread = NA, order = NA)

We can just do an assignment
Placeholder_df[MineralVars] <- NA
and remove the "MineralVars"
Placeholder_df["MineralVars"] <- NULL
Or use add_column
library(tibble)
library(dplyr)
add_column(Placeholder_df,.before = "MineralVars",
!!!setNames(rep(NA,length(MineralVars)), MineralVars)) %>%
mutate(MineralVars = NULL)
# make modelid unitno compart Al Cu Pb meterread order
#1 Placeholder Placeholder Placeholder Placeholder NA NA NA NA NA
and assign (<-) the result either back to 'Placeholder_df' or to a new object.

Related

Separate a column into multiple column in the desired way mentioned

I can separate (using ", ") a column into multiple column.
The idea is to reverse the order of words (separated by ", ") and then separate them into multiple columns. Example of reversing - "CA, SF" becomes "SF, CA"
Below is an example
library(tidyverse)
# sample example
tbl <- tibble(
letter = c("US, CA, SF","NYC", "Florida, Miami")
)
# desired result
tbl_desired <- tibble(
country = c("US", NA, NA),
state = c("CA", NA, "Florida"),
city = c("SF", "NYC", "Miami")
)
# please edit it to get the desired result
tbl %>%
# please add line to reverse the string
mutate() %>%
separate(letter, into = c("country", "state", "city"), sep = ", ")
The separate function has a fill argument (its default is "warn") that can be set to either "right" or "left". Here, the missing pieces should be filled from the "left".
library(tidyr)
separate(tbl, letter, into = c("country", "state", "city"),
sep = ", ", fill = "left")
-output
# A tibble: 3 × 3
country state city
<chr> <chr> <chr>
1 US CA SF
2 <NA> <NA> NYC
3 <NA> Florida Miami

Error in env_bind_lazy(private$bindings, !!!set_names(promises, names_bindings)) : attempt to use zero-length variable name

I am trying to filter every column of my dataframe with a certain threshold (in this case >= 1.2) with dplyrs filter function. It worked nicely so far, but suddenly I get this error message, when I try to run the code:
Error in env_bind_lazy(private$bindings, !!!set_names(promises, names_bindings)) :
attempt to use zero-length variable name
This is part of my dataframe (it has 108 columns, some rows contain NA):
Mean 1
Mean 2
Mean 3
1.1874
1.0944
1.2376
1.2258
1.0665
1.2365
1.0953
1.1420
1.2479
1.2234
1.0949
1.0608
NA
NA
1.146
This is my code:
Heights_filtered = list()
for (i in 1:length(allHeights)){
filtered = filter(allHeights, allHeights[,i] >=1.2, .preserve = TRUE)
filterlist = cbind.data.frame(filtered[,i])
colnames(filterlist) = colnames(allHeights[i])
Heights_filtered[[i]] = cbind.data.frame(filterlist)
names(Heights_filtered) = colnames(allHeights[i])}
Do you have an idea why this happens now?
Thanks for your help!
These are the first rows of my dataframe
> dput(head(allHeights[1:10], 10))
structure(list(Mean1 = c(1.18743006611931, 1.22582285838843,
1.09595291724188, 1.22341059362058, 1.32431882583739, 1.31219937513623,
1.28004068880331, 1.29884472862021, 1.36733270362566, 1.38170457022452
), Mean2 = c(1.09447069039104, 1.09233667417252, 1.08767127319823,
1.06656658866469, 1.14203717603426, 1.09491221098798, 1.03171589621323,
1.15308990831089, 1.17585765375955, 1.11962264706315), Mean3 = c(1.23761700966768,
1.07486913672867, 1.2605330014152, 1.21512728264762, 1.23659397432181,
1.17488789237668, 1.28191444014391, 1.23137649405787, 1.22165765827209,
1.17481969002029), Mean4 = c(1.0608309164187, 1.06201740178538,
1.07512524012204, 1.07230027496328, 1.07823270179668, 1.08137782967343,
1.08704659309202, 1.09783795999849, 1.05538815021281, 1.04118799201477
), Mean5 = c(1.3872325431161, 1.34236438736957, 1.11657498580741,
1.19758040835503, 1.19718888867138, 1.12759626490222, 1.13074799835562,
1.19262768435683, 1.16498639469099, 1.2131433157802), Mean6 = c(1.18440664423239,
1.20342967777624, 1.21238802071329, 1.12420289186988, 1.22123880207133,
1.19712964243458, 1.20605725349191, 1.23989305305859, 1.21075923108837,
1.24834431998033), Mean7 = c(1.13543425248546, 1.12286625398612,
1.09469483808257, 1.10461963472656, 1.11445916679456, 1.08465067103221,
1.12117801538173, 1.08284306202145, 1.11304377483331, 1.13541719957027
), Mean8 = c(1.24793883159642, 1.19395390601616, 1.18592691355337,
1.19717830807325, 1.191232891622, 1.19336888792142, 1.17576392479116,
1.13564256754918, 1.11424178933907, 1.18585888819352), Mean9 = c(1.20505670697375,
1.18604713515832, 1.19024318309784, 1.21607636002896, 1.30812129661903,
1.24325012735609, 1.19658417567097, 1.27798482451672, 1.04137061962088,
1.30975681690216), Mean10 = c(1.06327665140615, 1.13939757285081,
1.12462757067074, 1.06967153549887, 1.08647627352663, 1.16336022091418,
1.15385873119686, 1.1672116851973, 1.22303975001817, 1.13392922026016
)), row.names = c(NA, 10L), class = "data.frame")
And here the last part of the dataframe which gives me the error:
dput(head(allHeights[100:108], 10))
structure(list(c(1.3975238170743, 1.42479618398277, 1.36302374440084,
1.33075672890157, 1.30214981303101, 1.29526565452359, 1.31860044132609,
1.23876534400972, 1.15907559361002, 1.26664552529697), c(2.22279564798051,
2.15443577725511, 2.36887256975583, 2.04737812822552, 2.21183099544832,
2.08881706966277, NA, NA, NA, NA), c(1.03731717809005, 1.07517206767995,
1.10263120160597, 1.17071264697448, 1.12660596501291, 1.07340120447376,
1.05339833667909, 1.02742328649269, 1.04743332377402, 1.09359764840837
), c(1.75325898322414, 1.80777043843246, 1.26273660420002, 1.59312822030592,
1.11652967053664, 1.62459472912435, 1.28563356786353, 1.95060067533935,
NA, NA), c(1.34261413268355, 1.30548480529631, 1.32490460208726,
1.05392855500896, 1.36887499425314, 1.12776424072456, 1.24322559882304,
1.24394280722725, 1.51098340306193, 1.35122063353409), c(1.30861179458687,
1.30802444638463, 1.32818477656957, 1.2115882212874, 1.27803793951901,
1.34488451464402, 1.2494642431939, 1.14564647987936, 1.13223271688229,
1.21111199301532), c(1.19828142850047, 1.2299458600308, 1.18492028013709,
1.24207768340535, 1.14210500173844, 1.14374410172354, 1.17129836586698,
1.20543386479909, 1.17938210897531, 1.1315377738042), c(1.06870742201506,
1.19744233297478, 1.14709573323772, 1.21291980399187, 1.19923509023545,
1.1095972272021, 1.1777817616828, 1.13757918011235, 1.18910601171268,
1.18139715549181), c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("Mean100",
"Mean101", "Mean102", "Mean103", "Mean104", "Mean105", "Mean106",
"Mean107", NA), row.names = c(NA, 10L), class = "data.frame")
This solution might work and is more in line with a dplyr approach. The code below uses mtcars as an example. It will keep rows where all values are greater than or equal to one.
library(dplyr)
# Keep only the rows in which every column is greater than or equal to 1.
# if_all() is the helper intended for filter() predicates; using across()
# directly as a filter() condition is deprecated in current dplyr.
mtcars %>%
  filter(
    if_all(
      .cols = everything(),
      .fns = ~ .x >= 1
    )
  )
Edit: If you also need to handle missing data, this should help. You can define the function you want to filter with and then apply it inside the filter statement. Here is another example with mtcars.
# Predicate used for filtering: a value passes when it is missing or >= 2.
myrowfun <- function(x) {
  is.na(x) | x >= 2
}
# Demo data: blank out mpg (as a numeric NA), then give the first seven
# rows the value 1 so the column mixes missing and non-missing entries.
mtcars <- mutate(mtcars, mpg = NA_real_)
mtcars$mpg[1:7] <- rep(1, 7)
# Apply the predicate column by column and keep a row only when every
# column passes. Calling myrowfun() on the whole data frame returned by
# across() yields a logical matrix, whereas filter() expects one logical
# value per row; if_all() does the per-column combination explicitly.
mtcars %>%
  filter(
    if_all(
      .cols = everything(),
      .fns = myrowfun
    )
  )
edit: adding an example of how to remove all missing columns
# Add a column that contains nothing but NA (same result as
# mutate(mtcars, newcol = NA), without needing dplyr for this step).
mtcars$newcol <- NA
# Shows which columns are not all missing: TRUE when a column holds at
# least one non-NA value. all(!is.na(x)) would instead flag every column
# with *any* NA, which also throws away partially filled columns.
has_data <- vapply(mtcars, function(x) !all(is.na(x)), logical(1))
has_data
# Subset on that; drop = FALSE keeps a data frame even if only one column
# survives.
mtcars2 <- mtcars[, has_data, drop = FALSE]

How to use reactive values in substr?

I'm having trouble with my shiny app. I want that the user can type in all the variables needed for the substr function to filter data from a data frame using dplyr.
I made an example using the dataframe iris.
In the textInput(select1) I like to type in "Species".
In the numericInput(start1) I like to type in "4".
In the numericInput(end1) I like to type in "6".
In the textInput(match1) I like to type in "osa".
Now I want that the tableOutput only shows the rows which matches the criteria "osa" in the column "Species" from digit 4 to 6.
The numericInput(start1), the numericInput(end1) and the textInput(match1) are working. But the textInput(select1) doesn't work. When I'm using the input as variable I'm getting an empty data frame.
When I change the code and type in "Species" instead of reactivevar1() in the substr function, I get the data frame I want.
Example:
filter(substr(Species, reactivevar2(), reactivevar3()) == reactivevar4())
This alternative works. But this is not what I want.
I want this to work:
filter(substr(reactivevar1(), reactivevar2(), reactivevar3()) == reactivevar4())
I tried different functions like substring and stringr::str_sub. I also tried as.character.
This is the full example:
library(shiny)
library(dplyr)
ui = fluidPage(
textInput(inputId="select1", label="Type in variable", value = "", width = NULL, placeholder = NULL),
numericInput(inputId="start1", label="Start digit", value=NULL, min = NA, max = NA, step = NA,
width = NULL),
numericInput(inputId="end1", label="End digit", value=NULL, min = NA, max = NA, step = NA,
width = NULL),
textInput(inputId="match1", label="Criteria to match", value = "", width = NULL, placeholder = NULL),
actionButton(inputId="startfil", label="Start filter", icon = NULL, width = NULL),
tableOutput('table')
)
server = function(input, output,session) {
obs <- observeEvent(input$startfil, {
var1 <- NA
reactivevar1 <- reactive({
var1 <- input$select1
return(var1)})
var2 <- NA
reactivevar2 <- reactive({
var2 <- input$start1
return(var2)})
var3 <- NA
reactivevar3 <- reactive({
var3 <- input$end1
return(var3)})
var4 <- NA
reactivevar4 <- reactive({
var4 <- input$match1
return(var4)})
irisfiltered <- iris %>%
filter(substr(reactivevar1(), reactivevar2(), reactivevar3()) == reactivevar4()) #reactivevar1() doesn't work
output$table <- renderTable(irisfiltered)
})
}
shinyApp(ui = ui, server = server)
I just can't figure out what is wrong with my code. It is important that the user can type in a start and an end digit to filter the substring.
Welcome to SO!
reactivevar1() has the value "Species", so your substr function returns "cie". And
substr(reactivevar1(), reactivevar2(), reactivevar3()) == reactivevar4()
returns FALSE when you type, e.g., "osa" in reactivevar4()
You could use get in your pipe statement like this:
irisfiltered <- iris %>%
filter(substr(get(reactivevar1()), reactivevar2(), reactivevar3()) == reactivevar4())
output$table <- renderTable(irisfiltered)
Or make use of !! and as.name
# Turn the typed column name into a symbol and unquote it with !!, so that
# substr() receives the values of that column rather than the name string.
# The result has to be stored, because renderTable() below refers to
# irisfiltered.
irisfiltered <- iris %>%
  filter(
    substr(!!as.name(reactivevar1()), reactivevar2(), reactivevar3()) ==
      reactivevar4()
  )
output$table <- renderTable(irisfiltered)
Hope this helps

dataframe plot_ly zero not aligned

I am a little bit new to R plot_ly and have an issue with a graph.
I have constructed a dataframe with values for different dates depending on the column. My dataframe has dates in the first column and then values for the different dates but not necessarily on all dates. Hence I am using na.locf to remove na between 2 values by the first value (hope I am clear enough). The remaining na are the ones before the first value for each column, which I then replace by 0.
Then I am trying to plot my df with the dates on the x axis and the evolution of my time series on the y axis.
My issue is that there seems to be a difference between the 0 values in one of my graphs (see screenshot). The orange line begins at 0 and then has some values. If I put the mouse on the graph before the "big drop", the value is shown as 0; after it, the value is shown as 0.00. The df has no values for this time series at this point. Also, the graphs do not seem to share the same y axis even though they should, and I do not understand why.
My code to create the graph is:
if (dim(df1)[1] != 0){
df1 <- na.locf(df1)
df1[is.na(df1)] <- 0.00
all_names <- colnames(df1)[-1]
for (i in all_names){
if (i==all_names[1]){
p <- plot_ly(x = df1$date, y= df1[,i] , name = i, type = 'scatter', mode = 'lines')
}else{
p <- p %>% add_trace(y = df1[,i], name = i, type = 'scatter', mode = 'lines')
}
}
output$info_graph <- renderPlotly({
p
})
output$info_output <- renderUI({
plotlyOutput("info_graph")
})
}else{
output$info_output <- renderUI({
})
}
EDIT: I have added a screenshot of my data where the gap is (orange line is the third column (before the 2008-09-12 I only have NA), blue one is the second column:
EDIT2: I just reproduced with 26 dates. You can see the screenshot:
dput gives:
structure(list(date = structure(c(14062, 14069, 14076, 14083,
14090, 14097, 14104, 14111, 14118, 14125, 14132, 14134, 14139,
14141, 14146, 14148, 14153, 14155, 14160, 14162, 14167, 14169,
14174, 14176, 14181, 14183), class = "Date"), col1 = c(3036258.57195313,
3023427.6675, 2971520.82675781, 3093997.64199219, 3042965.63564453,
3119076.22796875, 3154652.82667969, 3120534.28529297, 3101871.15154297,
3226680.85849609, 3185563.64195312, NA, 3077375.78849609, NA,
3039466.29806641, NA, 2956357.03058594, NA, 2701488.6103125,
NA, 2715194.34916016, NA, 2687199.64853516, NA, 2733857.48291016,
NA), col2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0, NA, 0, NA, 0, NA, 0, NA, 0, NA, 0, NA, 0, NA, 0)), .Names = c("date",
"col1", "col2"), row.names = 145:170, class = "data.frame")
The problem with your code is that you are plotting character variables which are converted to categorical in the plot call. The culprit is the mis(s)use of na.locf function.
The first column of your data frame is a character column, when you provide the whole data frame to na.locf it converts everything to character. Here is a fix:
library(zoo)
library(plotly)
# Convert the date column to POSIXct.
df1$date <- as.POSIXct(df1$date)
# Every column after the first (date) is a series to fill and plot.
# NOTE(review): assumes date is the first column and all other columns are
# numeric, as in the posted data -- confirm for other inputs.
all_names <- colnames(df1)[-1]
# Use na.locf only on the value columns, so the date column is not dragged
# into the fill. na.rm = FALSE keeps leading NAs in place, so the result has
# the same number of rows as df1 and can be assigned straight back.
df1[all_names] <- na.locf(df1[all_names], na.rm = FALSE)
# Anything still NA lies before a series' first observation: show it as 0.
df1[is.na(df1)] <- 0.00
# The first series creates the plot; each further series is added as a trace.
for (i in all_names) {
  if (i == all_names[1]) {
    p <- plot_ly(
      x = df1$date, y = df1[[i]], name = i,
      type = "scatter", mode = "lines"
    )
  } else {
    p <- p %>%
      add_trace(y = df1[[i]], name = i, type = "scatter", mode = "lines")
  }
}
p

Mapping nearest neighbours of a long-lat data set using ggmap, geom_point and a loop

My ultimate goal is to connect all nearest neighbours of a set of buildings (based on Euclidean distance) on a ggmap using geom_path from the ggplot2 package. I need help with a loop that will allow me to plot all neighbours as easily as possible
I have created a distance matrix (called 'kmnew') in kilometres between 3 types of building in Beijing: B (x2), D (x2) and L (x1):
B B D D L
B NA 6.599014 5.758531 6.285787 3.770175
B NA NA 7.141096 3.873296 5.092667
D NA NA NA 3.690725 2.563017
D NA NA NA NA 2.832083
L NA NA NA NA NA
I try to discern the nearest neighbours of each building by row by declaring a matrix and using a loop to ascertain the nearest neighbour building:
nn <- matrix(NA,nrow=5,ncol=1)
for (i in 1:nrow(kmnew)){
nn[i,] <- which.min(kmnew[i,])
}
This returns the following error (not sure why):
Error in nn[i, ] <- which.min(kmnew[i, ]) : replacement has length zero
but seems to return the correct answer to nn:
[,1]
[1,] 5
[2,] 4
[3,] 5
[4,] 5
[5,] NA
I append this to an original dataframe called newbjdata:
colbj <- cbind(newbjdata,nn)
that returns
Name Store sqft long lat nn
1 B 1 1200 116.4579 39.93921 5
2 B 2 750 116.3811 39.93312 4
3 D 1 550 116.4417 39.88882 5
4 D 2 600 116.4022 39.90222 5
5 L 1 1000 116.4333 39.91100 NA
I then retrieve my map via ggmap:
bjgmap <- get_map(location = c(lon = 116.407395,lat = 39.904211),
zoom = 13, scale = "auto",
maptype = "roadmap",
messaging = FALSE, urlonly = FALSE,
filename = "ggmaptemp", crop = TRUE,
color = "bw",
source = "google", api_key)
My ultimate goal is to map the nearest neighbours together in a plot using geom_path from the ggplot package.
For example, the nn of the 1st building of type B (row 1) is the 1 building of type L (row 5). Obviously I can draw this line by subsetting the said 2 rows of the dataframe thus:
ggmap(bjgmap) +
geom_point(data = colbj, aes(x = long,y = lat, fill = factor(Name)),
size =10, pch = 21, col = "white") +
geom_path(data = subset(colbj[c(1,5),]), aes(x = long,y = lat),col = "black")
However, I need a solution that works like a loop, and I can't figure out how one might achieve this, as I need to reference the nn column and refer that back to the long lat data n times. I can well believe that I am not using the most efficient method, so am open to alternatives. Any help much appreciated.
Here is my attempt. I used gcIntermediate() from the geosphere package to set up lines. First, I needed to rearrange your data. When you use gcIntermediate(), you need departure and arrival long/lat. That is you need four columns. In order to arrange your data in this way, I used the dplyr package. mutate_each(colbj, funs(.[nn]), vars = long:lat) works for you to pick up desired arrival long/lat. . is for 'long' and 'lat'. [nn] is the vector index for the variables. Then, I employed gcIntermediate(). This creates SpatialLines. You need to make the object a SpatialLinesDataFrame. Then, you need to convert the output to "normal" data.frame. This step is essential so that ggplot can read your data. fortify() is doing the job.
library(ggmap)
library(geosphere)
library(dplyr)
library(ggplot2)
### Arrange the data: set up departure and arrival long/lat
# nn holds the row index of each building's nearest neighbour, so indexing
# long/lat by nn gives the arrival coordinates. This produces the same
# columns as mutate_each(colbj, funs(.[nn]), vars = long:lat) followed by
# rename(), without relying on functions that dplyr has deprecated.
mydf <- colbj %>%
  mutate(arr_long = long[nn], arr_lat = lat[nn]) %>%
  # Rows whose nn is NA have no neighbour and therefore no line to draw.
  filter(!is.na(nn))
### Get line information
rts <- gcIntermediate(mydf[,c("long", "lat")],
mydf[,c("arr_long", "arr_lat")],
50,
breakAtDateLine = FALSE,
addStartEnd = TRUE,
sp = TRUE)
### Convert the routes to a data frame for ggplot use
rts <- as(rts, "SpatialLinesDataFrame")
rts.df <- fortify(rts)
### Get a map (borrowing the OP's code)
bjgmap <- get_map(location = c(lon = 116.407395,lat = 39.904211),
zoom = 13, scale = "auto",
maptype = "roadmap",
messaging = FALSE, urlonly = FALSE,
filename = "ggmaptemp", crop = TRUE,
color = "bw",
source = "google", api_key)
# Draw the map
ggmap(bjgmap) +
geom_point(data = colbj,aes(x = long, y = lat, fill = factor(Name)),
size = 10,pch = 21, col = "white") +
geom_path(data = rts.df, aes(x = long, y = lat, group = group),
col = "black")
EDIT
If you want to do all data manipulation in one sequence, the following is one way to go. foo is identical to rts.df above.
# Same steps as above in a single pipeline: build the arrival coordinates,
# drop rows without a neighbour, compute the connecting lines, and convert
# them to a plain data frame for ggplot. Uses mutate() in place of the
# deprecated mutate_each()/funs(), and a braced pipe step in place of do().
foo <- colbj %>%
  mutate(arr_long = long[nn], arr_lat = lat[nn]) %>%
  filter(!is.na(nn)) %>%
  {
    # Inside the braces `.` is the filtered data frame.
    gcIntermediate(
      .[, c("long", "lat")],
      .[, c("arr_long", "arr_lat")],
      n = 50,
      breakAtDateLine = FALSE,
      addStartEnd = TRUE,
      sp = TRUE
    )
  } %>%
  as("SpatialLinesDataFrame") %>%
  fortify()
identical(rts.df, foo)
#[1] TRUE
DATA
colbj <- structure(list(Name = structure(c(1L, 1L, 2L, 2L, 3L), .Label = c("B",
"D", "L"), class = "factor"), Store = c(1L, 2L, 1L, 2L, 1L),
sqft = c(1200L, 750L, 550L, 600L, 1000L), long = c(116.4579,
116.3811, 116.4417, 116.4022, 116.4333), lat = c(39.93921,
39.93312, 39.88882, 39.90222, 39.911), nn = c(5L, 4L, 5L,
5L, NA)), .Names = c("Name", "Store", "sqft", "long", "lat",
"nn"), class = "data.frame", row.names = c("1", "2", "3", "4",
"5"))

Resources