Union All of SQL temporary tables created using dplyr? - r

How does one rbind or bind_rows temporary tables created in SQL (tested and failed in Postgres and SQLite) by dplyr?
E.g.
library(dplyr)
# Open an in-memory SQLite database. RSQLite's dbConnect() method takes the
# database location as `dbname`, so that is the argument used here.
con <- DBI::dbConnect(RSQLite::SQLite(), dbname = ":memory:")
# Copy the flights data into two non-temporary tables, "flights" and
# "flights2", creating the same set of indexes on each.
copy_to(con, nycflights13::flights, "flights",
  temporary = FALSE,
  indexes = list(
    c("year", "month", "day"),
    "carrier",
    "tailnum",
    "dest"
  )
)
copy_to(con, nycflights13::flights, "flights2",
  temporary = FALSE,
  indexes = list(
    c("year", "month", "day"),
    "carrier",
    "tailnum",
    "dest"
  )
)
# References to the two database tables (these are tbl_lazy objects, not
# in-memory data frames).
flights_db <- tbl(con, "flights")
flights_db_2 <- tbl(con, "flights2")
Calling bind_rows gives the following error:
> bind_rows(flights_db, flights_db_2)
Error in bind_rows_(x, .id) :
Argument 1 must be a data frame or a named atomic vector, not a tbl_dbi/tbl_sql/tbl_lazy/tbl

Here both objects, 'flights' and 'flights2', contain the same records, and `union` keeps only unique records. To combine two database tables, we need
union(flights_db, flights_db_2)
Because both objects are the same, the above will only return a result with the same dimensions as 'flights_db'. If we need double the number of rows, first create a unique identifier in each table:
flights1 <- nycflights13::flights %>%
mutate(id= 1)
flights2 <- nycflights13::flights %>%
mutate(id = 2)
copy_to(con, flights1, "flights",
temporary = FALSE,
overwrite = TRUE,
indexes = list(
c("year", "month", "day"),
"carrier",
"tailnum",
"dest"
)
)
copy_to(con, flights2, "flights2",
temporary = FALSE,
overwrite = TRUE,
indexes = list(
c("year", "month", "day"),
"carrier",
"tailnum",
"dest"
)
)
flights_db <- tbl(con, "flights")
flights_db_2 <- tbl(con, "flights2")
Now we do the union
union(flights_db, flights_db_2) %>%
summarise(n = n())
# Source: lazy query [?? x 1]
# Database: sqlite 3.19.3 []
# n
# <int>
#1 673552
dim(nycflights13::flights)
#[1] 336776 19
To demonstrate the uniqueness, we can select a small subset of disjoint rows for each of the two objects and then do the union
copy_to(con, nycflights13::flights[1:20,], "flights",
temporary = FALSE,
overwrite = TRUE,
indexes = list(
c("year", "month", "day"),
"carrier",
"tailnum",
"dest"
)
)
copy_to(con, nycflights13::flights[21:30,], "flights2",
temporary = FALSE,
overwrite = TRUE,
indexes = list(
c("year", "month", "day"),
"carrier",
"tailnum",
"dest"
)
)
flights_db <- tbl(con, "flights")
flights_db_2 <- tbl(con, "flights2")
union(flights_db, flights_db_2) %>%
collect
# A tibble: 30 x 19
# year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance
# <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl>
# 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR IAH 227 1400
# 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA IAH 227 1416
# 3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK MIA 160 1089
# 4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN 183 1576
# 5 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD 150 719
# 6 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL 116 762
# 7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR FLL 158 1065
# 8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA IAD 53 229
# 9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK MCO 140 944
#10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA ORD 138 733
# ... with 20 more rows, and 3 more variables: hour <dbl>, minute <dbl>, time_hour <dbl>

With thanks to Akrun for pointing me to the union family, it is possible to somewhat replicate bind_rows with:
Reduce(union_all, list(flights_db, flights_db, flights_db))
As noted in the comments to, and in Akrun's answer, union produces unique records in the result, and union_all is the equivalent to SQL's UNION ALL.

Related

Troubleshooting API coordinate request loop

I want to find the coordinates for a list of addresses.
I am using a data set that can be found here: "https://www.data.gv.at/katalog/dataset/kaufpreissammlung-liegenschaften-wien"
I've imported this using the read_csv function as "data". I'm using the tidyverse and jsonlite libraries. The only relevant columns are "Straße", which is the street name, and "ON", which is the street number. The city for all of these is Vienna, Austria.
I'm using OpenStreetMap and have formatted my address data like the format requires:
data$formatted_address <- paste(ifelse(is.na(data$ON), "", data$ON), "+", tolower(data$Straße), ",+vienna", sep = "")
This formats the addresses in this column as 1+milanweg,+vienna and 12+granergasse,+vienna. When I manually input this into the API format, it all works out and I get the coordinates: https://nominatim.openstreetmap.org/search?q=1+milanweg,+vienna&format=json&polygon=1&addressdetails=1
Since I now want to do this for every row of my data, I am using jsonlite to create the requests in R.
# Placeholder for the results: a data frame with `lat` and `lon` columns,
# stored in data$coordinates.
data$coordinates <- data.frame(lat = NA, lon = NA)
# Issue one Nominatim search request per row of `data`.
for (i in 1:nrow(data)) {
# Download the response body as text; try(..., silent = TRUE) captures a failed
# request as a "try-error" object so the loop keeps going.
result <- try(readLines(paste0("https://nominatim.openstreetmap.org/search?q=",
URLencode(data$formatted_address[i]), "&format=json&polygon=1&addressdetails=1")),
silent = TRUE)
# Only parse when the download succeeded and returned at least one line.
if (!inherits(result, "try-error")) {
if (length(result) > 0) {
result <- fromJSON(result)
# NOTE(review): result[[1]] is the first element of whatever fromJSON() returned.
# If that is a data frame rather than a list of hits, this is its first column,
# is.list() is FALSE and nothing is stored -- confirm against the parsed structure.
if (length(result) > 0 && is.list(result[[1]])) {
data$coordinates[i, ] <- c(result[[1]]$lat, result[[1]]$lon)
}
}
}
}
This should theoretically create the exact same API request, however, the lat and lon columns are always empty.
How can I fix this script to create a list of coordinates for each address in the data set?
Data setup
library(tidyverse)
library(httr2)
df <- df %>%
mutate(
formatted_address = str_c(
if_else(is.na(on), "", on), "+", str_to_lower(strasse), "+vienna"
) %>% str_remove_all(" ")
)
# A tibble: 57,912 × 7
kg_code katastralgemeinde ez plz strasse on formatted_address
<dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
1 1617 Strebersdorf 1417 1210 Mühlweg 13 13+mühlweg+vienna
2 1607 Groß Jedlersdorf II 193 1210 Bahnsteggasse 4 4+bahnsteggasse+vienna
3 1209 Ober St.Veit 3570 1130 Jennerplatz 34/20 34/20+jennerplatz+vienna
4 1207 Lainz 405 1130 Sebastian-Brunner-Gasse 6 6+sebastian-brunner-gasse+vienna
5 1101 Favoriten 3831 1100 Laxenburger Straße 2C -2 D 2C-2D+laxenburgerstraße+vienna
6 1101 Favoriten 3827 1100 Laxenburger Straße 2 C 2C+laxenburgerstraße+vienna
7 1101 Favoriten 3836 1100 hinter Laxenburger Straße 2 C 2C+hinterlaxenburgerstraße+vienna
8 1201 Auhof 932 1130 Keplingergasse 10 10+keplingergasse+vienna
9 1213 Speising 135 1130 Speisinger Straße 29 29+speisingerstraße+vienna
10 1107 Simmering 2357 1100 BATTIGGASSE 44 44+battiggasse+vienna
# … with 57,902 more rows
# ℹ Use `print(n = ...)` to see more rows
API call and getting coordinates.
I gathered the display name matched by the API, and the lat & lon data.
# Look up a single address on the OpenStreetMap Nominatim search endpoint.
#
# `address` is spliced as-is into the `q=` query parameter; no additional
# encoding is applied here. A progress line is printed before the request.
# Returns a tibble holding the first match only, with the columns `api_name`
# (the API's `display_name`), `lat` and `lon`.
get_coords <- function(address) {
  cat("Getting coordinates", address, "\n")
  search_url <- str_c(
    "https://nominatim.openstreetmap.org/search?q=",
    address,
    "&format=json&polygon=1&addressdetails=1"
  )
  response <- req_perform(request(search_url))
  matches <- as_tibble(resp_body_json(response, simplifyVector = TRUE))
  # Keep just the three fields of interest, then the first hit.
  picked <- select(matches, api_name = display_name, lat, lon)
  slice(picked, 1)
}
df %>%
slice_sample(n = 10) %>%
mutate(coordinates = map(
formatted_address, possibly(get_coords, tibble(
api_name = NA_character_,
lat = NA_character_,
lon = NA_character_
))
)) %>%
unnest(coordinates)
# A tibble: 10 × 10
kg_code katastralgemeinde ez plz strasse on formatted_…¹ api_n…² lat lon
<dbl> <chr> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 1651 Aspern 3374 1220 ERLENWEG 8 8+erlenweg+… 8, Erl… 48.2… 16.4…
2 1613 Leopoldau 6617 1210 Oswald-Redlich-Straße 31 31+oswald-r… 31, Os… 48.2… 16.4…
3 1006 Landstraße 2425 1030 HAGENMÜLLERGASSE 45018 45018+hagen… Hagenm… 48.1… 16.4…
4 1101 Favoriten 541 1100 HERNDLGASSE 7 7+herndlgas… 7, Her… 48.1… 16.3…
5 1607 Groß Jedlersdorf II 221 1210 Prager Straße 70 70+pragerst… Prager… 48.2… 16.3…
6 1006 Landstraße 1184 1030 PAULUSGASSE 2 2+paulusgas… 2, Pau… 48.1… 16.3…
7 1654 Eßling 2712 1220 KAUDERSSTRASSE 61 61+kauderss… 61, Ka… 48.2… 16.5…
8 1401 Dornbach 2476 1170 Alszeile NA +alszeile+v… Alszei… 48.2… 16.2…
9 1654 Eßling 745 1220 Kirschenallee 19 19+kirschen… 19, Ki… 48.2… 16.5…
10 1204 Hadersdorf 3139 1140 MITTLERE STRASSE NA +mittlerest… Mittle… 48.2… 16.1…
# … with abbreviated variable names ¹​formatted_address, ²​api_name

How to calculate total and percentage while accounting for another column in R?

All,
Thanks in advance. I have this school dataset. Each category (in the Category column) covers a range of student numbers (e.g., from 30 to 60 students), so I need to calculate:
the total number of classrooms that fall in each category (from category 1 to category 4), and
the percentage of classrooms that fall in the category.
For example, how many classrooms (NumOfClassrooms column) fall in Category_4, and what's the percentage of those classrooms to the total classrooms? Here is an illustrative example for my question:
ID = 1:1050
District = rep(c("AR", "CO", "AL", "KS", "IN", "ME", "KY", "ME", "MN", "NJ"), times = c(80, 120, 100, 110, 120, 100, 100, 120, 100, 100))
schoolName = randomNames::randomNames(1050, ethnicity = 5 ,which.names = "last")
Grade = rep(c("First", "Second", "Third", "Fourth"), times = c(400, 300, 200, 150))
NumOfClassrooms = sample(1:6)
StudentNumber = sample(1:90, 5)
AverageNumOfStudents = StudentNumber/NumOfClassrooms
Category = ifelse(AverageNumOfStudents > 0 & AverageNumOfStudents < 10, "category_1",
ifelse(AverageNumOfStudents >=10 & AverageNumOfStudents < 30, "category_2",
ifelse(AverageNumOfStudents >=30 & AverageNumOfStudents <= 60, "category_3",
ifelse(AverageNumOfStudents > 60 , "category_4", "NA"))))
dat = data.frame(ID, schoolName, Grade, NumOfClassrooms, StudentNumber, AverageNumOfStudents, Category)
Finally, I need to divide the results based on the "District" column into separate excel files using the following code (it should work fine once I get the above two steps).
Final_Divide = Final_df %>%
dplyr::group_by(District) %>%
dplyr::ungroup()
list_data <- split(Final_Divide,
Final_Divide$District)
options(digits=3)
Map(openxlsx::write.xlsx, list_data, paste0(names(list_data), '.xlsx'))
Thank you very much in advance.
Setting a random seed before your code for reproducibility:
set.seed(42)
# Your code creating dat
Table1 <- xtabs(NumOfClassrooms~Category, dat)
Table1
# Category
# category_1 category_2 category_4
# 1925 1575 175
Table2 <- prop.table(Table1)
round(Table2, 4) # Proportions
# Category
# category_1 category_2 category_4
# 0.5238 0.4286 0.0476
round(Table2 * 100, 2) # Percent
# Category
# category_1 category_2 category_4
# 52.38 42.86 4.76
If we include District in dat:
dat <- data.frame(ID, District, schoolName, Grade, NumOfClassrooms, StudentNumber, AverageNumOfStudents, Category)
Table3 <- xtabs(NumOfClassrooms~District+Category, dat)
addmargins(Table3)
# Category
# District category_1 category_2 category_4 Sum
# AL 187 149 16 352
# AR 143 121 14 278
# CO 220 180 20 420
# IN 220 180 20 420
# KS 198 166 19 383
# KY 187 148 17 352
# ME 407 329 36 772
# MN 176 153 17 346
# NJ 187 149 16 352
# Sum 1925 1575 175 3675
For row percentages by District:
round(prop.table(Table3, 1) * 100, 2)
# Category
# District category_1 category_2 category_4
# AL 53.12 42.33 4.55
# AR 51.44 43.53 5.04
# CO 52.38 42.86 4.76
# IN 52.38 42.86 4.76
# KS 51.70 43.34 4.96
# KY 53.12 42.05 4.83
# ME 52.72 42.62 4.66
# MN 50.87 44.22 4.91
# NJ 53.12 42.33 4.55
Here's a possible solution using the tidyverse
dat %>%
mutate("Total Classrooms" = n()) %>%
group_by(Category) %>%
mutate("Number of Classrooms in Category" = n(),
"Category Percentage" = `Number of Classrooms in Category`/`Total Classrooms` * 100)
This will give us:
# Groups: Category [3]
ID District schoolName Grade NumOfClassrooms StudentNumber AverageNumOfStude~ Category `Total Classroom~ `Number of Classrooms in~ `Category Percent~
<int> <chr> <chr> <chr> <int> <int> <dbl> <chr> <int> <int> <dbl>
1 1 AR Svyatetskiy First 5 87 17.4 category~ 1050 525 50
2 2 AR Booco First 1 79 79 category~ 1050 175 16.7
3 3 AR Jones First 6 49 8.17 category~ 1050 350 33.3
4 4 AR Sapkin First 3 5 1.67 category~ 1050 350 33.3
5 5 AR Fosse First 2 35 17.5 category~ 1050 525 50
6 6 AR Vanwagenen First 4 87 21.8 category~ 1050 525 50
7 7 AR Orth First 5 79 17.4 category~ 1050 525 50
8 8 AR Moline First 1 49 79 category~ 1050 175 16.7
9 9 AR Bradford First 6 5 8.17 category~ 1050 350 33.3
10 10 AR Wollman First 3 35 1.67 category~ 1050 350 33.3
# ... with 1,040 more rows
If you need a separate table of just the category/# classrooms/percentage data:
dat %>%
mutate("Total Classrooms" = n()) %>%
group_by(Category) %>%
mutate("Number of Classrooms in Category" = n(),
"Category Percentage" = `Number of Classrooms in Category`/`Total Classrooms` * 100) %>%
select(Category, "Number of Classrooms in Category", "Category Percentage") %>%
unique()
This gives us:
# A tibble: 3 x 3
# Groups: Category [3]
Category `Number of Classrooms in Category` `Category Percentage`
<chr> <int> <dbl>
1 category_2 525 50
2 category_4 175 16.7
3 category_1 350 33.3
Note that in your post, this code is a bit redundant:
Final_Divide = Final_df %>%
dplyr::group_by(District) %>%
dplyr::ungroup()
If you group and then immediately ungroup, you're actually just doing this:
Final_Divide <- Final_df
You could also consider adding split(.$District) to transform your data into a list all in one chunk of code:
dat %>%
mutate("Total Classrooms" = n()) %>%
group_by(Category) %>%
mutate("Number of Classrooms in Category" = n(),
"Category Percentage" = `Number of Classrooms in Category`/`Total Classrooms` * 100) %>%
split(.$District)

Sum variables by unique variable names, for different metrics - requires finding unique names before/after a prefix in variable name

Is there a way to sum variables (e.g. sales and units) for all unique variable names (brands like coke and pepsi) within a dataframe?
To help, here is some example data.
# Reproducible example data: seven daily observations of units and sales for
# two brands (Coke, Pepsi), each in a Regular and a Diet variant.
set.seed(123)
period <- seq(as.Date("2021/01/01"), as.Date("2021/01/07"), by = "day")
# The sample() calls stay in this order so the seeded values do not change.
Coke_Regular_Units <- sample(1000:2000, 7, replace = TRUE)
Coke_Diet_Units <- sample(1000:2000, 7, replace = TRUE)
Coke_Regular_Sales <- sample(500:1000, 7, replace = TRUE)
Coke_Diet_Sales <- sample(500:1000, 7, replace = TRUE)
Pepsi_Regular_Units <- sample(1000:2000, 7, replace = TRUE)
Pepsi_Diet_Units <- sample(1000:2000, 7, replace = TRUE)
Pepsi_Regular_Sales <- sample(500:1000, 7, replace = TRUE)
Pepsi_Diet_Sales <- sample(500:1000, 7, replace = TRUE)
# `period` is included as the first column so that the data frame has the
# date column shown in the printed head(df).
df <- data.frame(
  period,
  Coke_Regular_Units, Coke_Diet_Units, Coke_Regular_Sales, Coke_Diet_Sales,
  Pepsi_Regular_Units, Pepsi_Diet_Units, Pepsi_Regular_Sales, Pepsi_Diet_Sales
)
> head(df)
period Coke_Regular_Units Coke_Diet_Units Coke_Regular_Sales Coke_Diet_Sales Pepsi_Regular_Units
1 2021-01-01 1414 1117 589 847 1425
2 2021-01-02 1462 1298 590 636 1648
3 2021-01-03 1178 1228 755 976 1765
4 2021-01-04 1525 1243 696 854 1210
5 2021-01-05 1194 1013 998 827 1931
6 2021-01-06 1937 1373 590 525 1589
Pepsi_Diet_Units Pepsi_Regular_Sales Pepsi_Diet_Sales
1 1554 608 943
2 1870 762 808
3 1372 892 634
4 1843 924 808
5 1142 829 910
6 1543 522 723
I would like code that automatically calculates Coke_Sales, Coke_Units, Pepsi_Sales, Pepsi_Units, Regular_Sales and Diet_Units.
I am currently doing it like this for each variable
library(dplyr)
df$Coke_Sales <- rowSums(Filter(is.numeric, select(df, (matches("Coke") & matches("Sales")))))
df$Coke_Units <- rowSums(Filter(is.numeric, select(df, (matches("Coke") & matches("Units")))))
This is ok for a small number of variables, but I need to do this for 100s of variables. Is there any function that enables this? It would need to automatically find the unique variable names like Coke, Pepsi, Diet and Regular. The metric is the last part of the variable name, so doesn't necessarily need to auto-find this but would be great. If it makes it any easier, it would be ok to specify the metrics as there are only 3 metrics at most, but there are hundreds of brands.
If it can't be automated, is there a way it can be simplified, where I specify the variables required? Not perfect, but still an improvement. For example, including these lines of code to specify the variables to sum and the metrics required:
VarsToSum <- c("Coke", "Pepsi", "Diet", "Regular")
Metrics <- c("Sales", "Units")
If it can't be accomplished that way either, maybe I need to break into smaller steps, any tips would be great. Trying to think how to do it, should I try to find unique name before a prefix "_", then calculate "Sales" and "Units" for those unique names. Would this be the best way to do it? Or should I reshape the data? Are there any other routes to get there?
Any help, or directions how to achieve this would be greatly appreciated. Thanks
Here is a data.table approach...
library( data.table )
setDT(df) #make it a data.table
#melt to long
ans <- melt( df, id.vars = "period", variable.factor = FALSE )
#split variable to 3 new columns
ans[, c("brand", "type", "what") := tstrsplit( variable, "_" ) ]
# > head(ans)
# period variable value brand type what
# 1: 2021-01-01 Coke_Regular_Units 1414 Coke Regular Units
# 2: 2021-01-02 Coke_Regular_Units 1462 Coke Regular Units
# 3: 2021-01-03 Coke_Regular_Units 1178 Coke Regular Units
# 4: 2021-01-04 Coke_Regular_Units 1525 Coke Regular Units
# 5: 2021-01-05 Coke_Regular_Units 1194 Coke Regular Units
# 6: 2021-01-06 Coke_Regular_Units 1937 Coke Regular Units
#summarise however you like
ans[, .(total = sum(value) ), by = .(brand, type, what)]
# brand type what total
# 1: Coke Regular Units 10527
# 2: Coke Diet Units 8936
# 3: Coke Regular Sales 5158
# 4: Coke Diet Sales 5171
# 5: Pepsi Regular Units 11160
# 6: Pepsi Diet Units 10813
# 7: Pepsi Regular Sales 5447
# 8: Pepsi Diet Sales 5491
Using outer to paste the name parts together into patterns, and grep to match them against the column names.
sapply(outer(c("Coke", "Pepsi"), c("Sales", "Units"), paste, sep=".*"), function(x)
rowSums(df[grep(x, names(df))]))
# Coke.*Sales Pepsi.*Sales Coke.*Units Pepsi.*Units
# [1,] 1436 1551 2531 2979
# [2,] 1226 1570 2760 3518
# [3,] 1731 1526 2406 3137
# [4,] 1550 1732 2768 3053
# [5,] 1825 1739 2207 3073
# [6,] 1115 1245 3310 3132
# [7,] 1446 1575 3481 3081
Here's a solution similar in spirit to that of @Wimpel, but with the tidyverse:
library(tidyverse)
summary_df <-
df %>%
pivot_longer(cols = ends_with("Sales") | ends_with("Units"),
names_to = c("brand", "type", ".value"),
names_pattern = "(.*)_(.*)_(.*)") %>%
group_by(brand) %>%
summarize(Sales = sum(Sales),
Units = sum(Units)) %>%
pivot_wider(names_from = "brand",
values_from = c("Sales", "Units"),
names_glue = "{brand}_{.value}")
summary_df
# # A tibble: 1 x 4
# Coke_Sales Pepsi_Sales Coke_Units Pepsi_Units
# <int> <int> <int> <int>
# 1 10329 10938 19463 21973

Selecting specific columns with readr cols_only [duplicate]

Can anyone please tell me how to read only the first 6 months (7 columns) for each year of the data below, for example by using read.table()?
Year Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
2009 -41 -27 -25 -31 -31 -39 -25 -15 -30 -27 -21 -25
2010 -41 -27 -25 -31 -31 -39 -25 -15 -30 -27 -21 -25
2011 -21 -27 -2 -6 -10 -32 -13 -12 -27 -30 -38 -29
Say the data are in file data.txt, you can use the colClasses argument of read.table() to skip columns. Here the data in the first 7 columns are "integer" and we set the remaining 6 columns to "NULL" indicating they should be skipped
> read.table("data.txt", colClasses = c(rep("integer", 7), rep("NULL", 6)),
+ header = TRUE)
Year Jan Feb Mar Apr May Jun
1 2009 -41 -27 -25 -31 -31 -39
2 2010 -41 -27 -25 -31 -31 -39
3 2011 -21 -27 -2 -6 -10 -32
Change "integer" to one of the accepted types as detailed in ?read.table depending on the real type of data.
data.txt looks like this:
$ cat data.txt
"Year" "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"
2009 -41 -27 -25 -31 -31 -39 -25 -15 -30 -27 -21 -25
2010 -41 -27 -25 -31 -31 -39 -25 -15 -30 -27 -21 -25
2011 -21 -27 -2 -6 -10 -32 -13 -12 -27 -30 -38 -29
and was created by using
write.table(dat, file = "data.txt", row.names = FALSE)
where dat is
dat <- structure(list(Year = 2009:2011, Jan = c(-41L, -41L, -21L), Feb = c(-27L,
-27L, -27L), Mar = c(-25L, -25L, -2L), Apr = c(-31L, -31L, -6L
), May = c(-31L, -31L, -10L), Jun = c(-39L, -39L, -32L), Jul = c(-25L,
-25L, -13L), Aug = c(-15L, -15L, -12L), Sep = c(-30L, -30L, -27L
), Oct = c(-27L, -27L, -30L), Nov = c(-21L, -21L, -38L), Dec = c(-25L,
-25L, -29L)), .Names = c("Year", "Jan", "Feb", "Mar", "Apr",
"May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"), class = "data.frame",
row.names = c(NA, -3L))
If the number of columns is not known beforehand, the utility function count.fields will read through the file and count the number of fields in each line.
## returns a vector equal to the number of lines in the file
count.fields("data.txt", sep = "\t")
## returns the maximum to set colClasses
max(count.fields("data.txt", sep = "\t"))
To read a specific set of columns from a dataset, there are several other options:
1) With fread from the data.table package:
You can specify the desired columns with the select parameter from fread from the data.table package. You can specify the columns with a vector of column names or column numbers.
For the example dataset:
library(data.table)
dat <- fread("data.txt", select = c("Year","Jan","Feb","Mar","Apr","May","Jun"))
dat <- fread("data.txt", select = c(1:7))
Alternatively, you can use the drop parameter to indicate which columns should not be read:
dat <- fread("data.txt", drop = c("Jul","Aug","Sep","Oct","Nov","Dec"))
dat <- fread("data.txt", drop = c(8:13))
All result in:
> data
Year Jan Feb Mar Apr May Jun
1 2009 -41 -27 -25 -31 -31 -39
2 2010 -41 -27 -25 -31 -31 -39
3 2011 -21 -27 -2 -6 -10 -32
UPDATE: When you don't want fread to return a data.table, use the data.table = FALSE-parameter, e.g.: fread("data.txt", select = c(1:7), data.table = FALSE)
2) With read.csv.sql from the sqldf-package:
Another alternative is the read.csv.sql function from the sqldf package:
library(sqldf)
dat <- read.csv.sql("data.txt",
sql = "select Year,Jan,Feb,Mar,Apr,May,Jun from file",
sep = "\t")
3) With the read_*-functions from the readr-package:
library(readr)
dat <- read_table("data.txt",
col_types = cols_only(Year = 'i', Jan = 'i', Feb = 'i', Mar = 'i',
Apr = 'i', May = 'i', Jun = 'i'))
dat <- read_table("data.txt",
col_types = list(Jul = col_skip(), Aug = col_skip(), Sep = col_skip(),
Oct = col_skip(), Nov = col_skip(), Dec = col_skip()))
dat <- read_table("data.txt", col_types = 'iiiiiii______')
From the documentation an explanation for the used characters with col_types:
each character represents one column: c = character, i = integer, n = number, d = double, l = logical, D = date, T = date time, t = time, ? = guess, or _/- to skip the column
You could also use JDBC to achieve this. Let's create a sample csv file.
# Create the example CSV file. TRUE/FALSE are spelled out because T and F are
# ordinary variables that can be reassigned.
write.table(x = mtcars, file = "mtcars.csv", sep = ",", row.names = FALSE, col.names = TRUE)
Download and save the CSV JDBC driver from this link: http://sourceforge.net/projects/csvjdbc/files/latest/download
> library(RJDBC)
> path.to.jdbc.driver <- "jdbc//csvjdbc-1.0-18.jar"
> drv <- JDBC("org.relique.jdbc.csv.CsvDriver", path.to.jdbc.driver)
> conn <- dbConnect(drv, sprintf("jdbc:relique:csv:%s", getwd()))
> head(dbGetQuery(conn, "select * from mtcars"), 3)
mpg cyl disp hp drat wt qsec vs am gear carb
1 21 6 160 110 3.9 2.62 16.46 0 1 4 4
2 21 6 160 110 3.9 2.875 17.02 0 1 4 4
3 22.8 4 108 93 3.85 2.32 18.61 1 1 4 1
> head(dbGetQuery(conn, "select mpg, gear from mtcars"), 3)
MPG GEAR
1 21 4
2 21 4
3 22.8 4
The vroom package provides a 'tidy' method of selecting / dropping columns by name during import. Docs: https://www.tidyverse.org/blog/2019/05/vroom-1-0-0/#column-selection
Column selection (col_select)
The vroom argument 'col_select' makes selecting columns to keep (or omit) more straightforward. The interface for col_select is the same as dplyr::select().
Select columns by name
data <- vroom("flights.tsv", col_select = c(year, flight, tailnum))
#> Observations: 336,776
#> Variables: 3
#> chr [1]: tailnum
#> dbl [2]: year, flight
#>
#> Call `spec()` for a copy-pastable column specification
#> Specify the column types with `col_types` to quiet this message
Drop columns by name
data <- vroom("flights.tsv", col_select = c(-dep_time, -air_time:-time_hour))
#> Observations: 336,776
#> Variables: 13
#> chr [4]: carrier, tailnum, origin, dest
#> dbl [9]: year, month, day, sched_dep_time, dep_delay, arr_time, sched_arr_time, arr...
#>
#> Call `spec()` for a copy-pastable column specification
#> Specify the column types with `col_types` to quiet this message
Use the selection helpers
data <- vroom("flights.tsv", col_select = ends_with("time"))
#> Observations: 336,776
#> Variables: 5
#> dbl [5]: dep_time, sched_dep_time, arr_time, sched_arr_time, air_time
#>
#> Call `spec()` for a copy-pastable column specification
#> Specify the column types with `col_types` to quiet this message
Or rename columns by name
data <- vroom("flights.tsv", col_select = list(plane = tailnum, everything()))
#> Observations: 336,776
#> Variables: 19
#> chr [ 4]: carrier, tailnum, origin, dest
#> dbl [14]: year, month, day, dep_time, sched_dep_time, dep_delay, arr_time, sched_arr...
#> dttm [ 1]: time_hour
#>
#> Call `spec()` for a copy-pastable column specification
#> Specify the column types with `col_types` to quiet this message
data
#> # A tibble: 336,776 x 19
#> plane year month day dep_time sched_dep_time dep_delay arr_time
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 N142… 2013 1 1 517 515 2 830
#> 2 N242… 2013 1 1 533 529 4 850
#> 3 N619… 2013 1 1 542 540 2 923
#> 4 N804… 2013 1 1 544 545 -1 1004
#> 5 N668… 2013 1 1 554 600 -6 812
#> 6 N394… 2013 1 1 554 558 -4 740
#> 7 N516… 2013 1 1 555 600 -5 913
#> 8 N829… 2013 1 1 557 600 -3 709
#> 9 N593… 2013 1 1 557 600 -3 838
#> 10 N3AL… 2013 1 1 558 600 -2 753
#> # … with 336,766 more rows, and 11 more variables: sched_arr_time <dbl>,
#> # arr_delay <dbl>, carrier <chr>, flight <dbl>, origin <chr>,
#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
#> # time_hour <dttm>
You do it like this:
# Read only the header and the first data row to learn each column's name
# and class.
df <- read.table("file.txt", nrows = 1, header = TRUE, sep = "\t",
  stringsAsFactors = FALSE)
# Take the class column by column with vapply(); apply() would first coerce
# the data frame to a matrix and report one common class for every column.
colClasses <- vapply(df, function(column) class(column)[1], character(1))
needCols <- c("Year", "Jan", "Feb", "Mar", "Apr", "May", "Jun")
# The string "NULL" in colClasses tells read.table() to skip that column.
colClasses[!names(colClasses) %in% needCols] <- "NULL"
df <- read.table("file.txt", header = TRUE, colClasses = colClasses, sep = "\t",
  stringsAsFactors = FALSE)

R drop columns when reading from CSV before the column type is identified [duplicate]

Can anyone please tell me how to read only the first 6 months (7 columns) for each year of the data below, for example by using read.table()?
Year Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
2009 -41 -27 -25 -31 -31 -39 -25 -15 -30 -27 -21 -25
2010 -41 -27 -25 -31 -31 -39 -25 -15 -30 -27 -21 -25
2011 -21 -27 -2 -6 -10 -32 -13 -12 -27 -30 -38 -29
Say the data are in file data.txt, you can use the colClasses argument of read.table() to skip columns. Here the data in the first 7 columns are "integer" and we set the remaining 6 columns to "NULL" indicating they should be skipped
> read.table("data.txt", colClasses = c(rep("integer", 7), rep("NULL", 6)),
+ header = TRUE)
Year Jan Feb Mar Apr May Jun
1 2009 -41 -27 -25 -31 -31 -39
2 2010 -41 -27 -25 -31 -31 -39
3 2011 -21 -27 -2 -6 -10 -32
Change "integer" to one of the accepted types as detailed in ?read.table depending on the real type of data.
data.txt looks like this:
$ cat data.txt
"Year" "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"
2009 -41 -27 -25 -31 -31 -39 -25 -15 -30 -27 -21 -25
2010 -41 -27 -25 -31 -31 -39 -25 -15 -30 -27 -21 -25
2011 -21 -27 -2 -6 -10 -32 -13 -12 -27 -30 -38 -29
and was created by using
write.table(dat, file = "data.txt", row.names = FALSE)
where dat is
dat <- structure(list(Year = 2009:2011, Jan = c(-41L, -41L, -21L), Feb = c(-27L,
-27L, -27L), Mar = c(-25L, -25L, -2L), Apr = c(-31L, -31L, -6L
), May = c(-31L, -31L, -10L), Jun = c(-39L, -39L, -32L), Jul = c(-25L,
-25L, -13L), Aug = c(-15L, -15L, -12L), Sep = c(-30L, -30L, -27L
), Oct = c(-27L, -27L, -30L), Nov = c(-21L, -21L, -38L), Dec = c(-25L,
-25L, -29L)), .Names = c("Year", "Jan", "Feb", "Mar", "Apr",
"May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"), class = "data.frame",
row.names = c(NA, -3L))
If the number of columns is not known beforehand, the utility function count.fields will read through the file and count the number of fields in each line.
## returns a vector equal to the number of lines in the file
count.fields("data.txt", sep = "\t")
## returns the maximum to set colClasses
max(count.fields("data.txt", sep = "\t"))
To read a specific set of columns from a dataset, there are several other options:
1) With fread from the data.table package:
You can specify the desired columns with the select parameter from fread from the data.table package. You can specify the columns with a vector of column names or column numbers.
For the example dataset:
library(data.table)
dat <- fread("data.txt", select = c("Year","Jan","Feb","Mar","Apr","May","Jun"))
dat <- fread("data.txt", select = c(1:7))
Alternatively, you can use the drop parameter to indicate which columns should not be read:
dat <- fread("data.txt", drop = c("Jul","Aug","Sep","Oct","Nov","Dec"))
dat <- fread("data.txt", drop = c(8:13))
All result in:
> data
Year Jan Feb Mar Apr May Jun
1 2009 -41 -27 -25 -31 -31 -39
2 2010 -41 -27 -25 -31 -31 -39
3 2011 -21 -27 -2 -6 -10 -32
UPDATE: When you don't want fread to return a data.table, use the data.table = FALSE-parameter, e.g.: fread("data.txt", select = c(1:7), data.table = FALSE)
2) With read.csv.sql from the sqldf-package:
Another alternative is the read.csv.sql function from the sqldf package:
library(sqldf)
dat <- read.csv.sql("data.txt",
sql = "select Year,Jan,Feb,Mar,Apr,May,Jun from file",
sep = "\t")
3) With the read_*-functions from the readr-package:
library(readr)
dat <- read_table("data.txt",
col_types = cols_only(Year = 'i', Jan = 'i', Feb = 'i', Mar = 'i',
Apr = 'i', May = 'i', Jun = 'i'))
dat <- read_table("data.txt",
col_types = list(Jul = col_skip(), Aug = col_skip(), Sep = col_skip(),
Oct = col_skip(), Nov = col_skip(), Dec = col_skip()))
dat <- read_table("data.txt", col_types = 'iiiiiii______')
From the documentation an explanation for the used characters with col_types:
each character represents one column: c = character, i = integer, n = number, d = double, l = logical, D = date, T = date time, t = time, ? = guess, or _/- to skip the column
You could also use JDBC to achieve this. Let's create a sample csv file.
# Create the example CSV file. TRUE/FALSE are spelled out because T and F are
# ordinary variables that can be reassigned.
write.table(x = mtcars, file = "mtcars.csv", sep = ",", row.names = FALSE, col.names = TRUE)
Download and save the CSV JDBC driver from this link: http://sourceforge.net/projects/csvjdbc/files/latest/download
> library(RJDBC)
> path.to.jdbc.driver <- "jdbc//csvjdbc-1.0-18.jar"
> drv <- JDBC("org.relique.jdbc.csv.CsvDriver", path.to.jdbc.driver)
> conn <- dbConnect(drv, sprintf("jdbc:relique:csv:%s", getwd()))
> head(dbGetQuery(conn, "select * from mtcars"), 3)
mpg cyl disp hp drat wt qsec vs am gear carb
1 21 6 160 110 3.9 2.62 16.46 0 1 4 4
2 21 6 160 110 3.9 2.875 17.02 0 1 4 4
3 22.8 4 108 93 3.85 2.32 18.61 1 1 4 1
> head(dbGetQuery(conn, "select mpg, gear from mtcars"), 3)
MPG GEAR
1 21 4
2 21 4
3 22.8 4
The vroom package provides a 'tidy' method of selecting / dropping columns by name during import. Docs: https://www.tidyverse.org/blog/2019/05/vroom-1-0-0/#column-selection
Column selection (col_select)
The vroom argument 'col_select' makes selecting columns to keep (or omit) more straightforward. The interface for col_select is the same as dplyr::select().
Select columns by name
data <- vroom("flights.tsv", col_select = c(year, flight, tailnum))
#> Observations: 336,776
#> Variables: 3
#> chr [1]: tailnum
#> dbl [2]: year, flight
#>
#> Call `spec()` for a copy-pastable column specification
#> Specify the column types with `col_types` to quiet this message
Drop columns by name
data <- vroom("flights.tsv", col_select = c(-dep_time, -air_time:-time_hour))
#> Observations: 336,776
#> Variables: 13
#> chr [4]: carrier, tailnum, origin, dest
#> dbl [9]: year, month, day, sched_dep_time, dep_delay, arr_time, sched_arr_time, arr...
#>
#> Call `spec()` for a copy-pastable column specification
#> Specify the column types with `col_types` to quiet this message
Use the selection helpers
data <- vroom("flights.tsv", col_select = ends_with("time"))
#> Observations: 336,776
#> Variables: 5
#> dbl [5]: dep_time, sched_dep_time, arr_time, sched_arr_time, air_time
#>
#> Call `spec()` for a copy-pastable column specification
#> Specify the column types with `col_types` to quiet this message
Or rename columns by name
data <- vroom("flights.tsv", col_select = list(plane = tailnum, everything()))
#> Observations: 336,776
#> Variables: 19
#> chr [ 4]: carrier, tailnum, origin, dest
#> dbl [14]: year, month, day, dep_time, sched_dep_time, dep_delay, arr_time, sched_arr...
#> dttm [ 1]: time_hour
#>
#> Call `spec()` for a copy-pastable column specification
#> Specify the column types with `col_types` to quiet this message
data
#> # A tibble: 336,776 x 19
#> plane year month day dep_time sched_dep_time dep_delay arr_time
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 N142… 2013 1 1 517 515 2 830
#> 2 N242… 2013 1 1 533 529 4 850
#> 3 N619… 2013 1 1 542 540 2 923
#> 4 N804… 2013 1 1 544 545 -1 1004
#> 5 N668… 2013 1 1 554 600 -6 812
#> 6 N394… 2013 1 1 554 558 -4 740
#> 7 N516… 2013 1 1 555 600 -5 913
#> 8 N829… 2013 1 1 557 600 -3 709
#> 9 N593… 2013 1 1 557 600 -3 838
#> 10 N3AL… 2013 1 1 558 600 -2 753
#> # … with 336,766 more rows, and 11 more variables: sched_arr_time <dbl>,
#> # arr_delay <dbl>, carrier <chr>, flight <dbl>, origin <chr>,
#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
#> # time_hour <dttm>
You do it like this:
# Read only the header and the first data row to learn each column's name
# and class.
df <- read.table("file.txt", nrows = 1, header = TRUE, sep = "\t",
  stringsAsFactors = FALSE)
# Take the class column by column with vapply(); apply() would first coerce
# the data frame to a matrix and report one common class for every column.
colClasses <- vapply(df, function(column) class(column)[1], character(1))
needCols <- c("Year", "Jan", "Feb", "Mar", "Apr", "May", "Jun")
# The string "NULL" in colClasses tells read.table() to skip that column.
colClasses[!names(colClasses) %in% needCols] <- "NULL"
df <- read.table("file.txt", header = TRUE, colClasses = colClasses, sep = "\t",
  stringsAsFactors = FALSE)

Resources