skip the empty dataframes and produce the output - r

sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1
locale:
[1] LC_COLLATE=German_Germany.1252 LC_CTYPE=German_Germany.1252
[3] LC_MONETARY=German_Germany.1252 LC_NUMERIC=C
[5] LC_TIME=German_Germany.1252
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] WriteXLS_3.5.1 tidyr_0.2.0 scales_0.2.4 gridExtra_0.9.1
[5] ggplot2_1.0.1 RPostgreSQL_0.4 DBI_0.3.1
loaded via a namespace (and not attached):
[1] Rcpp_0.11.6 assertthat_0.1 dplyr_0.4.1 digest_0.6.8
[5] MASS_7.3-40 plyr_1.8.2 gtable_0.1.2 magrittr_1.5
[9] stringi_0.4-1 lazyeval_0.1.10 reshape2_1.4.1 proto_0.3-10
[13] tools_3.2.0 stringr_1.0.0 munsell_0.4.2 parallel_3.2.0
[17] colorspace_1.2-6
#
library(RPostgreSQL)  # database access (the hourly counts are pulled from PostgreSQL)
library(ggplot2)      # plotting
library(gridExtra)    # not used in the visible snippet -- presumably for arranging plots elsewhere
library(scales)       # date_breaks() / date_format() for the datetime x axis
library(tidyr)        # gather() for wide -> long reshaping
# Shared theme element for axis text. NOTE(review): despite the name, this is
# black, bold, size 12 -- not blue/italic/16; consider renaming.
blue.bold.italic.16.text <- element_text(face = "bold", color = "black", size = 12)
#
Consider four machines running in parallel and producing products. Each data frame (l1, l2, l3, l4) below represents the number of pieces per hour for one machine (I actually collect the data from a database using RPostgreSQL; this is a sample of how it looks).
# Hourly piece counts for machine 1: 24 consecutive hourly POSIXct timestamps
# (tzone = "" means the session's local time zone) with one count per hour.
l1 <- structure(list(hours = structure(c(1434081600, 1434085200, 1434088800,
1434092400, 1434096000, 1434099600, 1434103200, 1434106800, 1434110400,
1434114000, 1434117600, 1434121200, 1434124800, 1434128400, 1434132000,
1434135600, 1434139200, 1434142800, 1434146400, 1434150000, 1434153600,
1434157200, 1434160800, 1434164400), class = c("POSIXct", "POSIXt"
), tzone = ""), count = c(25, 29, 28, 32, 33, 13, 33, 29, 32,
33, 27, 34, 25, 30, 13, 24, 26, 33, 40, 34, 26, 30, 22, 30)), .Names = c("hours",
"count"), row.names = c(NA, 24L), class = "data.frame")
# Hourly piece counts for machine 2 (sample values identical to l1).
l2 <- structure(list(hours = structure(c(1434081600, 1434085200, 1434088800,
1434092400, 1434096000, 1434099600, 1434103200, 1434106800, 1434110400,
1434114000, 1434117600, 1434121200, 1434124800, 1434128400, 1434132000,
1434135600, 1434139200, 1434142800, 1434146400, 1434150000, 1434153600,
1434157200, 1434160800, 1434164400), class = c("POSIXct", "POSIXt"
), tzone = ""), count = c(25, 29, 28, 32, 33, 13, 33, 29, 32,
33, 27, 34, 25, 30, 13, 24, 26, 33, 40, 34, 26, 30, 22, 30)), .Names = c("hours",
"count"), row.names = c(NA, 24L), class = "data.frame")
# Hourly piece counts for machine 3 (sample values identical to l1).
l3 <- structure(list(hours = structure(c(1434081600, 1434085200, 1434088800,
1434092400, 1434096000, 1434099600, 1434103200, 1434106800, 1434110400,
1434114000, 1434117600, 1434121200, 1434124800, 1434128400, 1434132000,
1434135600, 1434139200, 1434142800, 1434146400, 1434150000, 1434153600,
1434157200, 1434160800, 1434164400), class = c("POSIXct", "POSIXt"
), tzone = ""), count = c(25, 29, 28, 32, 33, 13, 33, 29, 32,
33, 27, 34, 25, 30, 13, 24, 26, 33, 40, 34, 26, 30, 22, 30)), .Names = c("hours",
"count"), row.names = c(NA, 24L), class = "data.frame")
# Hourly piece counts for machine 4 (sample values identical to l1).
l4 <- structure(list(hours = structure(c(1434081600, 1434085200, 1434088800,
1434092400, 1434096000, 1434099600, 1434103200, 1434106800, 1434110400,
1434114000, 1434117600, 1434121200, 1434124800, 1434128400, 1434132000,
1434135600, 1434139200, 1434142800, 1434146400, 1434150000, 1434153600,
1434157200, 1434160800, 1434164400), class = c("POSIXct", "POSIXt"
), tzone = ""), count = c(25, 29, 28, 32, 33, 13, 33, 29, 32,
33, 27, 34, 25, 30, 13, 24, 26, 33, 40, 34, 26, 30, 22, 30)), .Names = c("hours",
"count"), row.names = c(NA, 24L), class = "data.frame")
#
here is my script for the attached plot(output)
# Combine the per-machine hourly counts into one long data frame and plot.
# Machines that returned no data (NULL or zero-row data frames) are filtered
# out first, so this works whether 2, 3 or 4 machines were running.
# At least one machine must have data.
machines <- list("L 1" = l1, "L 2" = l2, "L 3" = l3, "L 4" = l4)
machines <- Filter(function(d) is.data.frame(d) && nrow(d) > 0, machines)
# Outer-join the surviving machines on the shared hour column (all = TRUE
# keeps an hour even if one machine has no row for it; missing counts -> NA).
df <- Reduce(function(a, b) merge(a, b, by = "hours", all = TRUE), machines)
colnames(df) <- c("hours", names(machines))
# Wide -> long: one row per (hour, machine). Using -hours instead of the old
# hard-coded 2:5 keeps this correct for any number of machine columns.
pd <- gather(df, "Ls", "count", -hours)
q <- ggplot(pd, aes(x = hours, y = count)) +
  geom_bar(stat = "identity") +
  theme(legend.position = "none") +
  xlab("Time") +
  ylab("No. Of Pieces") +  # fixed label typo: "Pecies" -> "Pieces"
  ggtitle("my sample") +
  scale_y_continuous(breaks = seq(0, 45, by = 5)) +
  theme(axis.text = blue.bold.italic.16.text) +
  scale_x_datetime(breaks = date_breaks("2 hour"),
                   minor_breaks = date_breaks("2 hour"),
                   labels = date_format("%H")) +
  theme(axis.text.x = element_text(angle = 0)) +
  facet_grid(~ Ls)
# When all 4 machines are working, everything is fine: I run the above script and get the required output.
In case any machine is not working and I have a data frame with empty rows, then I get an error while running my script:
# df <- merge(l1,l2, by="hours")
df <- merge(df,l3, by="hours")
df <- merge(df,l4, by="hours")
Error in fix.by(by.y, y) : 'by' must specify a uniquely valid column
and the next error at
pd <- gather(df, 'Ls', 'count', 2:5)
How can I skip the empty data frames and run the script successfully, producing the output for however many machines are operating (whether 2, 3 or 4)?

Judging from the error message, the data.frame that causes the error has neither rows nor columns; it seems to be NULL. So the easiest way would be to check for that situation and, if the data.frame is NULL, create a dummy that can be merge()d and gather()ed.
What I would do (not saying this is the best way) is
# For easier looping, put the data.frames in a list.
l <- list(l1, l2, l3, l4)
# A one-row dummy that mimics the structure of the real data.frames, so a
# missing machine can still be merge()d and gather()ed. Its timestamp is
# Sys.time(), which will not collide with the historical hours.
dummy <- structure(list(hours = structure(c(Sys.time()),
                                          class = c("POSIXct", "POSIXt"), tzone = ""),
                        count = c(0)),
                   .Names = c("hours", "count"),
                   row.names = c(NA, 1L), class = "data.frame")
# Replace empty data.frames with the dummy (length(NULL) == 0, so this
# covers both NULL and zero-column frames).
for (i in seq_along(l)) if (length(l[[i]]) == 0) l[[i]] <- dummy
# Full outer join of every machine on the hour column; missing counts -> NA.
for (i in seq_along(l)[-1]) l[[1]] <- merge(l[[1]], l[[i]],
                                            by = "hours", all = TRUE)
# Drop the dummy row(s): keep the 24 real hours (the Sys.time() dummy rows
# sort after them). NOTE(review): assumes exactly 24 real hours per day.
df <- l[[1]][1:24, ]
colnames(df) <- c("hours", "L 1", "L 2", "L 3", "L 4")
There is room for improvement but at least it should display the results, whether or not a machine is operating:
l2 <- NULL

One alternative would be to skip the merging all together and go right to stacking the datasets. You would just need to add the Ls column to each individual dataset first.
# Tag each machine's rows with its facet label before stacking.
# (Uses `<-` for assignment rather than `=`, per R style convention.)
l1$Ls <- "L 1"
l2$Ls <- "L 2"
l3$Ls <- "L 3"
l4$Ls <- "L 4"
Then you could use, e.g., bind_rows from dplyr to make your long dataset pd.
bind_rows(l1, l2, l3, l4)
Source: local data frame [96 x 3]
hours count Ls
1 2015-06-11 21:00:00 25 L 1
2 2015-06-11 22:00:00 29 L 1
3 2015-06-11 23:00:00 28 L 1
4 2015-06-12 00:00:00 32 L 1
5 2015-06-12 01:00:00 33 L 1
6 2015-06-12 02:00:00 13 L 1
7 2015-06-12 03:00:00 33 L 1
8 2015-06-12 04:00:00 29 L 1
9 2015-06-12 05:00:00 32 L 1
10 2015-06-12 06:00:00 33 L 1
.. ... ... ...
The positive of this approach is that one of the objects you bind can be an empty data.frame or NULL and it still works.
Example empty data.frame:
l4.2 = data.frame()
bind_rows(l1, l2, l3, l4.2)
Source: local data frame [72 x 3]
hours count Ls
1 2015-06-11 21:00:00 25 L 1
2 2015-06-11 22:00:00 29 L 1
3 2015-06-11 23:00:00 28 L 1
4 2015-06-12 00:00:00 32 L 1
5 2015-06-12 01:00:00 33 L 1
6 2015-06-12 02:00:00 13 L 1
7 2015-06-12 03:00:00 33 L 1
8 2015-06-12 04:00:00 29 L 1
9 2015-06-12 05:00:00 32 L 1
10 2015-06-12 06:00:00 33 L 1
.. ... ... ...
Example NULL:
l4.3 = NULL
bind_rows(l1, l2, l3, l4.3)
Source: local data frame [72 x 3]
hours count Ls
1 2015-06-11 21:00:00 25 L 1
2 2015-06-11 22:00:00 29 L 1
3 2015-06-11 23:00:00 28 L 1
4 2015-06-12 00:00:00 32 L 1
5 2015-06-12 01:00:00 33 L 1
6 2015-06-12 02:00:00 13 L 1
7 2015-06-12 03:00:00 33 L 1
8 2015-06-12 04:00:00 29 L 1
9 2015-06-12 05:00:00 32 L 1
10 2015-06-12 06:00:00 33 L 1
.. ... ... ...

Related

Joining two R dataframes on multiple columns with one column having slightly different values

I have two dataframes in R that I'm trying to join together, but one of the columns has values that are off by one or two (specifically the yardline_100 column in each). Below is the code that I'm using to join the two:
# Exact left join on all four key columns; every row of df1 is kept.
# A plain character vector is equivalent to the name == name pairs when
# both tables use the same column names.
fin_df <- df1 %>%
  left_join(df2, by = c("posteam", "qtr", "down", "yardline_100"))
Is there any way to make it so that they join even if that one column is off by one or two? You'll notice that the last two values rows have different numbers in that column. Below are samples of the dataframes:
# dput() of the nflverse play-by-play sample. The original dump ended with
# `.internal.selfref = <pointer: 0x...>`, which is not valid R syntax
# (dput cannot serialise data.table's self-reference pointer); that
# argument is dropped here so the structure() call actually parses.
df1 <- structure(list(play_id = c(4596, 4629, 4658, 4682, 4723, 4766,
4790, 4828, 4849, 4878, 4899, 4938), posteam = c("MIN", "MIN",
"MIN", "MIN", "MIN", "CIN", "CIN", "CIN", "CIN", "CIN", "CIN",
"CIN"), qtr = c(5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), yardline_100 = c(63,
58, 55, 50, 38, 61, 55, 52, 52, 20, 15, 15), down = c(2, 1, 2,
3, 1, 1, 2, 3, 4, 1, 2, 3)), row.names = c(NA, -12L), class = c("nflverse_data",
"tbl_df", "tbl", "data.table", "data.frame"), nflverse_timestamp = structure(1659046255.35538, class = c("POSIXct",
"POSIXt")), nflverse_type = "play by play", nflfastR_version = structure(list(
    c(4L, 3L, 0L, 9020L)), class = c("package_version", "numeric_version")))
# Second sample: same join keys plus Sportradar play ids. yardline_100
# differs from df1 by 1-2 yards in rows 8, 11 and 12 (53 vs 52, 16 vs 15),
# which is why an exact join drops those rows.
df2 <- structure(list(posteam = c("MIN", "MIN", "MIN", "MIN", "MIN",
"CIN", "CIN", "CIN", "CIN", "CIN", "CIN", "CIN"), qtr = c(5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), yardline_100 = c(63, 58, 55,
50, 38, 61, 55, 53, 52, 20, 16, 16), down = c(2, 1, 2, 3, 1,
1, 2, 3, 4, 1, 2, 3), play_id_SR = c("a9f97fb0-1407-11ec-ae9a-d77d9ecb2022",
"d49d54d0-1407-11ec-ae9a-d77d9ecb2022", "e8f74ad0-1407-11ec-ae9a-d77d9ecb2022",
"0208ae60-1408-11ec-ae9a-d77d9ecb2022", "257fd030-1408-11ec-ae9a-d77d9ecb2022",
"fe058030-1408-11ec-ae9a-d77d9ecb2022", "0da68200-1409-11ec-ae9a-d77d9ecb2022",
"26a5bd20-1409-11ec-ae9a-d77d9ecb2022", "70eacce0-1409-11ec-ae9a-d77d9ecb2022",
"99e5fb10-1409-11ec-ae9a-d77d9ecb2022", "a7646b00-1409-11ec-ae9a-d77d9ecb2022",
"de2683d0-1409-11ec-ae9a-d77d9ecb2022")), row.names = c(NA, -12L
), class = c("tbl_df", "tbl", "data.frame"))
An option is to use fuzzyjoin.
library(fuzzyjoin)
library(dplyr)    # %>%, select(), rename_with() -- used below but not loaded originally
library(stringr)  # str_remove() -- used below but not loaded originally
# Fuzzy left join: exact match on posteam/qtr/down, and yardline_100 may
# differ by at most 2 between the two tables.
df1 %>%
  fuzzy_left_join(
    df2,
    by = c("posteam", "qtr", "down", "yardline_100"),
    match_fun = list(`==`, `==`, `==`, function(x, y) abs(x - y) <= 2)) %>%
  # fuzzyjoin duplicates even exact-match key columns as .x/.y pairs:
  # drop the .y copies and strip the .x suffix from the survivors.
  select(-matches("(posteam|qtr|down).y")) %>%
  rename_with(~str_remove(.x, "(?<=(posteam|qtr|down)).x"))
## A tibble: 12 x 7
# play_id posteam qtr yardline_100.x down yardline_100.y play_id_SR
# <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
# 1 4596 MIN 5 63 2 63 a9f97fb0-1407-11ec-ae9a-d77d9ecb2022
# 2 4629 MIN 5 58 1 58 d49d54d0-1407-11ec-ae9a-d77d9ecb2022
# 3 4658 MIN 5 55 2 55 e8f74ad0-1407-11ec-ae9a-d77d9ecb2022
# 4 4682 MIN 5 50 3 50 0208ae60-1408-11ec-ae9a-d77d9ecb2022
# 5 4723 MIN 5 38 1 38 257fd030-1408-11ec-ae9a-d77d9ecb2022
# 6 4766 CIN 5 61 1 61 fe058030-1408-11ec-ae9a-d77d9ecb2022
# 7 4790 CIN 5 55 2 55 0da68200-1409-11ec-ae9a-d77d9ecb2022
# 8 4828 CIN 5 52 3 53 26a5bd20-1409-11ec-ae9a-d77d9ecb2022
# 9 4849 CIN 5 52 4 52 70eacce0-1409-11ec-ae9a-d77d9ecb2022
#10 4878 CIN 5 20 1 20 99e5fb10-1409-11ec-ae9a-d77d9ecb2022
#11 4899 CIN 5 15 2 16 a7646b00-1409-11ec-ae9a-d77d9ecb2022
#12 4938 CIN 5 15 3 16 de2683d0-1409-11ec-ae9a-d77d9ecb2022
Note the matching function function(x, y) abs(x - y) <= 2 for column "yardline_100".
The last two lines (select(...) and rename_with(...)) are necessary to remove the duplicate columns: fuzzyjoin seems to create duplicate (i.e. ".x" and ".y"-suffixed) columns even on exact matches; the last two commands remove these duplicate exact match columns.

How to remove rows with 0 in numeric columns in R [duplicate]

This question already has answers here:
Extracting columns having greater than certain values in R dataframe
(5 answers)
Select columns that don't contain any NA value in R
(3 answers)
Closed 2 years ago.
i have the following Dataset:
# Assign the dput() sample to a name: all of the following cleaning code
# refers to `fish_data`, but the original dump was never assigned.
fish_data <- structure(list(Species = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label =
c("Bream", "Parkki", "Perch", "Pike", "Roach", "Smelt", "Whitefish"),
class = "factor"),
WeightGRAM = c(242, 290, 340, 363, 430, 450), VertLengthCM = c(23.2,
24, 23.9, 26.3, 26.5, 26.8), DiagLengthCM = c(25.4, 26.3,
26.5, 29, 29, 29.7), CrossLengthCM = c(30, 31.2, 31.1, 33.5,
34, 34.7), HeightCM = c(11.52, 12.48, 12.3778, 12.73, 12.444,
13.6024), WidthCM = c(4.02, 4.3056, 4.6961, 4.4555, 5.134,
4.9274)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))
I am trying to check for "0" or negative values in the numeric columns and remove them.
I have the following code:
fish_data <- fish_data [which(rowSums(fish_data) > 0), ]
But i will get a error message:
Error in rowSums(fish_data) : 'x' must be numeric
I roughly guess because my "species" columns are factor, this message came up.
Can i know how can i skip the first column and ask R to check for only numeric columns for "0" or negative values?
Here is a way that keeps only the columns with no values less than or equal to zero.
# Keep a column if it is non-numeric, or numeric with all values > 0.
# vapply() (rather than sapply()) guarantees a logical vector even for a
# zero-column input, so the subsetting below is always well-typed.
keep <- vapply(fish_data, function(x) {
  if (is.numeric(x)) all(x > 0) else TRUE
}, logical(1))
fish_data[keep]
## A tibble: 6 x 7
# Species WeightGRAM VertLengthCM DiagLengthCM CrossLengthCM HeightCM WidthCM
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Bream 242 23.2 25.4 30 11.5 4.02
#2 Bream 290 24 26.3 31.2 12.5 4.31
#3 Bream 340 23.9 26.5 31.1 12.4 4.70
#4 Bream 363 26.3 29 33.5 12.7 4.46
#5 Bream 430 26.5 29 34 12.4 5.13
#6 Bream 450 26.8 29.7 34.7 13.6 4.93
Using dplyr we can use select to select columns where all values are greater than 0 or are not numeric.
library(dplyr)
# Keep non-numeric columns, plus numeric columns whose values are all > 0
# (tidyselect `where()`, dplyr >= 1.0).
df %>% select(where(~(is.numeric(.) && all(. > 0)) || !is.numeric(.)))
# A tibble: 6 x 7
# Species WeightGRAM VertLengthCM DiagLengthCM CrossLengthCM HeightCM WidthCM
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Bream 242 23.2 25.4 30 11.5 4.02
#2 Bream 290 24 26.3 31.2 12.5 4.31
#3 Bream 340 23.9 26.5 31.1 12.4 4.70
#4 Bream 363 26.3 29 33.5 12.7 4.46
#5 Bream 430 26.5 29 34 12.4 5.13
#6 Bream 450 26.8 29.7 34.7 13.6 4.93
In the previous version of dplyr, we can use select_if :
df %>% select_if(~(is.numeric(.) && all(. > 0)) || !is.numeric(.))
you only need to specifiy the columns for the rowSums() function:
fish_data <- fish_data[which(rowSums(fish_data[,2:7]) > 0), ]
Note that rowSums() sums all values across the row — I'm not sure if that's what you really want to achieve.
you can check the output of rowsums with:
> rowSums(fish_data[,2:7])
[1] 336.1400 388.2856 438.5739 468.9855 537.0780 559.7298
Thanks all, I think I figured it out.
i should be keying:
# Convert every value <= 0 to NA, then drop the affected rows.
# NOTE(review): the `<=` comparison also hits the factor column Species,
# which is what raises the "'<=' not meaningful for factors" warning quoted
# below; restricting the comparison to numeric columns would avoid it.
fish_data[fish_data <= 0] <- NA #convert records with less than or equal to 0 to NA
fish_data <- na.omit(fish_data) # delete rows with NA
But i will get a warning message:
Warning message: In Ops.factor(left, right) : ‘<=’ not meaningful for
factors
# Option 1: (Safer because will retain rows containing NAs)
# Subset data.frame to not contain any observations with 0 values:
# (rowMeans of the logical matrix equals 1 only when every column is != 0)
# data.frame => stdout (console)
df[rowMeans(df != 0, na.rm = TRUE) == 1,]
# Option 2: (More dangerous because it will remove all rows containing
# NAs) subset data.frame to not contain any observations with 0 values:
# data.frame => stdout (console)
df[complete.cases(replace(df, df == 0, NA)),]
# Option 3 (Variant of Option 1):
# Subset data.frame to not contain any observations with 0 values:
# (restricts the != 0 test to numeric columns, leaving factors untouched)
# data.frame => stdout (console)
df[rowMeans(Vectorize(function(x){x != 0})(df[,sapply(df, is.numeric)]),
na.rm = TRUE) == 1,]
# Option 4: Using Higher-order functions:
# Subset data.frame to not contain any observations with 0 values:
# (intersects, across numeric columns, the row indices whose value is > 0)
# data.frame => stdout (console)
df[Reduce(function(y, z){intersect(y, z)},
Map(function(x){which(x > 0)}, df[,sapply(df, is.numeric)])), ]
# Option 5 tidyverse:
# Subset data.frame to not contain any observations with 0 values:
# data.frame => stdout (console)
library(dplyr)
df %>%
filter_if(is.numeric, all_vars(. > 0))
Data:
# Sample data containing zeros: WeightGRAM has a 0 in row 5 and HeightCM a
# 0 in row 2, so those two rows should be removed by the filtering code.
df <- structure(list(Species = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label =
c("Bream", "Parkki", "Perch", "Pike", "Roach", "Smelt", "Whitefish"),
class = "factor"),
WeightGRAM = c(242, 290, 340, 363, 0, 450), VertLengthCM = c(23.2,
24, 23.9, 26.3, 26.5, 26.8), DiagLengthCM = c(25.4, 26.3,
26.5, 29, 29, 29.7), CrossLengthCM = c(30, 31.2, 31.1, 33.5,
34, 34.7), HeightCM = c(11.52, 0, 12.3778, 12.73, 12.444,
13.6024), WidthCM = c(4.02, 4.3056, 4.6961, 4.4555, 5.134,
4.9274)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))

What is the best way to expand the xts object and fill with NA?

Say i have the following xts object. How do I and what is the best way to expand 20 more rows and fill all the entries of the new rows with NA ?
# Assign the dput() sample to `my_xts`: the answer code below refers to
# that name, but the original dump was never assigned. 29 daily rows of a
# 4-column (x, y1, y2, y3) xts/zoo object with a POSIXct index.
my_xts <- structure(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, -0.626453810742332,
0.183643324222082, -0.835628612410047, 1.59528080213779, 0.329507771815361,
-0.820468384118015, 0.487429052428485, 0.738324705129217, 0.575781351653492,
-0.305388387156356, 1.51178116845085, 0.389843236411431, -0.621240580541804,
-2.2146998871775, 1.12493091814311, -0.0449336090152309, -0.0161902630989461,
0.943836210685299, 0.821221195098089, 0.593901321217509, 0.918977371608218,
0.782136300731067, 0.0745649833651906, -1.98935169586337, 0.61982574789471,
-0.0561287395290008, -0.155795506705329, -1.47075238389927, -0.47815005510862,
2.83588312039941, 4.71735910305809, 1.79442454531401, 2.77534322311874,
1.89238991883419, -0.754119113657213, 1.17001087340064, 1.2114200925793,
1.88137320657763, 4.20005074396777, 3.52635149691509, 1.67095280749283,
1.49327663972698, 3.39392675080947, 3.11332639734731, 0.62248861090096,
0.585009686075761, 2.72916392427366, 3.53706584903083, 1.77530757569954,
3.76221545290843, 2.79621176073414, 0.775947213498458, 2.68223938284885,
-0.258726192161585, 4.86604740340207, 5.96079979701172, 1.26555704706698,
-0.0882692526330615, 4.70915888232724, 2.59483618835753, 10.2048532815143,
2.88227999180049, 5.06921808735233, 3.084006476342, 0.770180373352784,
3.56637689854303, -2.41487588667311, 7.39666458468866, 3.45976001463569,
9.51783501108646, 4.42652858669899, 0.870160707234557, 4.83217906046716,
0.197707105067245, -0.760900200717306, 3.87433870655239, 1.6701243803447,
3.00331605489487, 3.22302397245499, 1.23143716143578, 1.29399380154449,
2.5944641546285, 6.53426098971961, -1.57070040128929, 4.78183856288526,
3.99885111364055, 6.18929951182909), .Dim = c(29L, 4L), .Dimnames = list(
NULL, c("x", "y1", "y2", "y3")), index = structure(c(1167667200,
1167753600, 1167840000, 1167926400, 1168012800, 1168099200, 1168185600,
1168272000, 1168358400, 1168444800, 1168531200, 1168617600, 1168704000,
1168790400, 1168876800, 1168963200, 1169049600, 1169136000, 1169222400,
1169308800, 1169395200, 1169481600, 1169568000, 1169654400, 1169740800,
1169827200, 1169913600, 1.17e+09, 1170086400), tzone = "", tclass = c("POSIXct",
"POSIXt")), class = c("xts", "zoo"))
Best way is always debatable. But the following works without any other packages. I use seq to create the newly wanted dates, starting from the last timestamp of the xts object. Add 1 day (60*60*24 seconds) to that and end after 20 days.
Then it is just a question of merging and the NA's are created automatically.
library(xts)
# Create 20 additional daily timestamps, starting one day (60*60*24 s)
# after the last index entry of the existing object.
new <- seq(from = end(my_xts) + 60*60*24,
to = end(my_xts) + 20*60*60*24,
by = "day")
# merge() aligns on the index; the new dates carry no data, so every data
# column is filled with NA automatically.
my_xts_new <- merge(my_xts, new)
tail(my_xts_new)
x y1 y2 y3
2007-02-13 17:00:00 NA NA NA NA
2007-02-14 17:00:00 NA NA NA NA
2007-02-15 17:00:00 NA NA NA NA
2007-02-16 17:00:00 NA NA NA NA
2007-02-17 17:00:00 NA NA NA NA
2007-02-18 17:00:00 NA NA NA NA

Combining components of a list in R

I have a list that contains data by year. I want to combine these components into a single dataframe, which is matched by row. Example list:
List [[1]]
State Year X Y
23 1971 etc etc
47 1971 etc etc
List[[2]]
State Year X Y
13 1972 etc etc
23 1973 etc etc
47 1973 etc etc
etc....
List[[45]]
State Year X Y
1 2017 etc etc
2 2017 etc etc
3 2017 etc etc
1 2017 etc etc
23 2017 etc etc
47 2017 etc etc
I want the dataframe to look like (I know I will have to go through and remove some extra columns:
State 1971_X 1971_Y 1972_X 1972_Y....2018_X 2019_Y
1 NA NA NA NA etc etc
2 NA NA etc etc etc etc
3 etc ect etc etc etc etc
...
50 NA NA etc etc etc etc
I have tried the command Outcomewanted=do.call("cbind", examplelist) but get the message
"Error in data.frame(..., check.names = FALSE) :
arguments imply differing number of rows: 36, 40, 20, 42, 38, 26, 17, 31, 35, 23, 33, 13, 29, 28, 32, 34, 41, 37, 43, 39, 30, 14, 10, 4, 7"
It seems that the cbind.fill command could be an option but has been retired? Thanks for any help in advance.
You may use reshape after a do.call(rbind()) manoeuvre.
# rbind all list elements into one long data.frame, then reshape long -> wide:
# one row per state, one x/y column pair per year (missing years become NA).
res <- reshape(do.call(rbind, lst), idvar="state", timevar="year", direction="wide")
res
# state x.1971 y.1971 x.1972 y.1972 x.1973 y.1973
# 1 23 1.3709584 0.3631284 NA NA -0.1061245 2.0184237
# 2 24 -0.5646982 0.6328626 NA NA 1.5115220 -0.0627141
# 3 13 NA NA 0.4042683 -0.09465904 NA NA
Data
# Two sample yearly data.frames (identical content to the original dput
# output, written as plain data.frame() calls).
lst <- list(
  data.frame(state = c(23, 24),
             year = c(1971, 1971),
             x = c(1.37095844714667, -0.564698171396089),
             y = c(0.363128411337339, 0.63286260496104)),
  data.frame(state = c(13, 23, 24),
             year = c(1972, 1973, 1973),
             x = c(0.404268323140999, -0.106124516091484, 1.51152199743894),
             y = c(-0.0946590384130976, 2.01842371387704, -0.062714099052421))
)

How to select a row along with its next and previous rows with data.table

I have this data table:
> dput(data_DT)
# dput() output of the sample data.table: High/Low with their lag-1
# companions and a `tr` column.
# NOTE(review): `tr` has only 5 values while every other column has 6 --
# this looks like a transcription error in the dump; as written the
# resulting object has unequal column lengths. Confirm the missing value.
structure(
list(Date = structure(
c(1512518400, 1512518400, 1512518400,
1512518400, 1512518400, 1512518400),
class = c("POSIXct", "POSIXt"),
tzone = "UTC"),
Time = structure(
c(1512573600, 1512573300, 1512573000,
1512572700, 1512572400, 1512572100),
class = c("POSIXct", "POSIXt"),
tzone = "UTC"),
High = c(46, 47, 49, 49, 49, 58),
High_lag1 = c(47, 49, 49, 49, 58, 60),
Low = c(45, 46, 46, 47, 43, 44),
Low_lag1 = c(46, 46, 47, 43, 44, 58),
tr = c(1, 3, 2, 6, 14),
tr_lag1 = c(1, 3, 2, 6, 14, 2)
),
row.names = c(NA, -6L),
class = c("data.table", "data.frame"))
I want to select some rows based on some conditions and then select either the previous or next row
This is the code I have so far
data_DT1 <- data_DT[Low < Low_lag1 & High < High_lag1 & tr > 13]
The code selects the row I want the row with the time 14:55 but I need in this case the row also with the time 15:00.
Also, in some circumstances I will be selecting a row and will need the previous row.
You can use the which() function that returns the index of your condition and then add or substract based on what you need :
# get row and next row :
data_DT[which(Low < Low_lag1 & High < High_lag1 & tr > 13)+c(0,1)]
# Date Time High High_lag1 Low Low_lag1 tr tr_lag1
# 1: 2017-12-06 2017-12-06 14:55:00 58 60 44 58 14 2
# 2: <NA> <NA> NA NA NA NA NA NA
# NOTE(review): here the matched row is the last one, so "+1" indexes past
# the end of the table and data.table returns an all-NA row.
# get previous row and row
data_DT[which(Low < Low_lag1 & High < High_lag1 & tr > 13)+c(-1,0)]
# Date Time High High_lag1 Low Low_lag1 tr tr_lag1
# 1: 2017-12-06 2017-12-06 15:00:00 49 58 43 44 6 14
# 2: 2017-12-06 2017-12-06 14:55:00 58 60 44 58 14 2

Resources