Selecting specific columns with readr cols_only [duplicate] - r

Can anyone please tell me how to read only the first 6 months (7 columns) for each year of the data below, for example by using read.table()?
Year Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
2009 -41 -27 -25 -31 -31 -39 -25 -15 -30 -27 -21 -25
2010 -41 -27 -25 -31 -31 -39 -25 -15 -30 -27 -21 -25
2011 -21 -27 -2 -6 -10 -32 -13 -12 -27 -30 -38 -29

Say the data are in the file data.txt. You can use the colClasses argument of read.table() to skip columns: here the data in the first 7 columns are "integer", and we set the remaining 6 columns to "NULL", indicating that they should be skipped.
> read.table("data.txt", colClasses = c(rep("integer", 7), rep("NULL", 6)),
+ header = TRUE)
Year Jan Feb Mar Apr May Jun
1 2009 -41 -27 -25 -31 -31 -39
2 2010 -41 -27 -25 -31 -31 -39
3 2011 -21 -27 -2 -6 -10 -32
Change "integer" to one of the accepted types as detailed in ?read.table depending on the real type of data.
data.txt looks like this:
$ cat data.txt
"Year" "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"
2009 -41 -27 -25 -31 -31 -39 -25 -15 -30 -27 -21 -25
2010 -41 -27 -25 -31 -31 -39 -25 -15 -30 -27 -21 -25
2011 -21 -27 -2 -6 -10 -32 -13 -12 -27 -30 -38 -29
and was created by using
write.table(dat, file = "data.txt", row.names = FALSE)
where dat is
dat <- structure(list(Year = 2009:2011, Jan = c(-41L, -41L, -21L), Feb = c(-27L,
-27L, -27L), Mar = c(-25L, -25L, -2L), Apr = c(-31L, -31L, -6L
), May = c(-31L, -31L, -10L), Jun = c(-39L, -39L, -32L), Jul = c(-25L,
-25L, -13L), Aug = c(-15L, -15L, -12L), Sep = c(-30L, -30L, -27L
), Oct = c(-27L, -27L, -30L), Nov = c(-21L, -21L, -38L), Dec = c(-25L,
-25L, -29L)), .Names = c("Year", "Jan", "Feb", "Mar", "Apr",
"May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"), class = "data.frame",
row.names = c(NA, -3L))
If the number of columns is not known beforehand, the utility function count.fields will read through the file and count the number of fields in each line.
## returns a vector with one element per line in the file
## (data.txt here is whitespace-separated, so the default sep works)
count.fields("data.txt")
## take the maximum to set the length of colClasses
max(count.fields("data.txt"))
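For instance, a minimal sketch that builds colClasses from the field count (assuming the first 7 fields are the ones you want):
n_fields <- max(count.fields("data.txt"))
read.table("data.txt", header = TRUE,
           colClasses = c(rep("integer", 7), rep("NULL", n_fields - 7)))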

To read a specific set of columns from a dataset, there are several other options:
1) With fread from the data.table-package:
You can specify the desired columns with the select parameter of fread, either as a vector of column names or as a vector of column numbers.
For the example dataset:
library(data.table)
dat <- fread("data.txt", select = c("Year","Jan","Feb","Mar","Apr","May","Jun"))
dat <- fread("data.txt", select = c(1:7))
Alternatively, you can use the drop parameter to indicate which columns should not be read:
dat <- fread("data.txt", drop = c("Jul","Aug","Sep","Oct","Nov","Dec"))
dat <- fread("data.txt", drop = c(8:13))
All result in:
> dat
Year Jan Feb Mar Apr May Jun
1 2009 -41 -27 -25 -31 -31 -39
2 2010 -41 -27 -25 -31 -31 -39
3 2011 -21 -27 -2 -6 -10 -32
UPDATE: when you don't want fread to return a data.table, use the data.table = FALSE argument, e.g.: fread("data.txt", select = c(1:7), data.table = FALSE)
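A quick sketch to confirm you then get a plain data frame back:
dat <- fread("data.txt", select = 1:7, data.table = FALSE)
class(dat)
# [1] "data.frame"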
2) With read.csv.sql from the sqldf-package:
Another alternative is the read.csv.sql function from the sqldf package:
library(sqldf)
dat <- read.csv.sql("data.txt",
sql = "select Year,Jan,Feb,Mar,Apr,May,Jun from file",
sep = "\t")
3) With the read_*-functions from the readr-package:
library(readr)
dat <- read_table("data.txt",
col_types = cols_only(Year = 'i', Jan = 'i', Feb = 'i', Mar = 'i',
Apr = 'i', May = 'i', Jun = 'i'))
dat <- read_table("data.txt",
col_types = list(Jul = col_skip(), Aug = col_skip(), Sep = col_skip(),
Oct = col_skip(), Nov = col_skip(), Dec = col_skip()))
dat <- read_table("data.txt", col_types = 'iiiiiii______')
The documentation explains the characters used in col_types:
each character represents one column: c = character, i = integer, n = number, d = double, l = logical, D = date, T = date time, t = time, ? = guess, or _/- to skip the column
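An equivalent sketch using cols() with a skipped default (assuming the same column names as above):
dat <- read_table("data.txt",
                  col_types = cols(.default = col_skip(),
                                   Year = "i", Jan = "i", Feb = "i", Mar = "i",
                                   Apr = "i", May = "i", Jun = "i"))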

You could also use JDBC to achieve this. Let's create a sample csv file.
write.table(x=mtcars, file="mtcars.csv", sep=",", row.names=F, col.names=T) # create example csv file
Download and save the CSV JDBC driver from this link: http://sourceforge.net/projects/csvjdbc/files/latest/download
> library(RJDBC)
> path.to.jdbc.driver <- "jdbc//csvjdbc-1.0-18.jar"
> drv <- JDBC("org.relique.jdbc.csv.CsvDriver", path.to.jdbc.driver)
> conn <- dbConnect(drv, sprintf("jdbc:relique:csv:%s", getwd()))
> head(dbGetQuery(conn, "select * from mtcars"), 3)
mpg cyl disp hp drat wt qsec vs am gear carb
1 21 6 160 110 3.9 2.62 16.46 0 1 4 4
2 21 6 160 110 3.9 2.875 17.02 0 1 4 4
3 22.8 4 108 93 3.85 2.32 18.61 1 1 4 1
> head(dbGetQuery(conn, "select mpg, gear from mtcars"), 3)
MPG GEAR
1 21 4
2 21 4
3 22.8 4
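When you are finished, you can close the connection with the standard DBI call:
# close the JDBC connection when done
dbDisconnect(conn)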

The vroom package provides a 'tidy' method of selecting / dropping columns by name during import. Docs: https://www.tidyverse.org/blog/2019/05/vroom-1-0-0/#column-selection
Column selection (col_select)
The vroom argument 'col_select' makes selecting columns to keep (or omit) more straightforward. The interface for col_select is the same as dplyr::select().
Select columns by name
data <- vroom("flights.tsv", col_select = c(year, flight, tailnum))
#> Observations: 336,776
#> Variables: 3
#> chr [1]: tailnum
#> dbl [2]: year, flight
#>
#> Call `spec()` for a copy-pastable column specification
#> Specify the column types with `col_types` to quiet this message
Drop columns by name
data <- vroom("flights.tsv", col_select = c(-dep_time, -air_time:-time_hour))
#> Observations: 336,776
#> Variables: 13
#> chr [4]: carrier, tailnum, origin, dest
#> dbl [9]: year, month, day, sched_dep_time, dep_delay, arr_time, sched_arr_time, arr...
#>
#> Call `spec()` for a copy-pastable column specification
#> Specify the column types with `col_types` to quiet this message
Use the selection helpers
data <- vroom("flights.tsv", col_select = ends_with("time"))
#> Observations: 336,776
#> Variables: 5
#> dbl [5]: dep_time, sched_dep_time, arr_time, sched_arr_time, air_time
#>
#> Call `spec()` for a copy-pastable column specification
#> Specify the column types with `col_types` to quiet this message
Or rename columns by name
data <- vroom("flights.tsv", col_select = list(plane = tailnum, everything()))
#> Observations: 336,776
#> Variables: 19
#> chr [ 4]: carrier, tailnum, origin, dest
#> dbl [14]: year, month, day, dep_time, sched_dep_time, dep_delay, arr_time, sched_arr...
#> dttm [ 1]: time_hour
#>
#> Call `spec()` for a copy-pastable column specification
#> Specify the column types with `col_types` to quiet this message
data
#> # A tibble: 336,776 x 19
#> plane year month day dep_time sched_dep_time dep_delay arr_time
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 N142… 2013 1 1 517 515 2 830
#> 2 N242… 2013 1 1 533 529 4 850
#> 3 N619… 2013 1 1 542 540 2 923
#> 4 N804… 2013 1 1 544 545 -1 1004
#> 5 N668… 2013 1 1 554 600 -6 812
#> 6 N394… 2013 1 1 554 558 -4 740
#> 7 N516… 2013 1 1 555 600 -5 913
#> 8 N829… 2013 1 1 557 600 -3 709
#> 9 N593… 2013 1 1 557 600 -3 838
#> 10 N3AL… 2013 1 1 558 600 -2 753
#> # … with 336,766 more rows, and 11 more variables: sched_arr_time <dbl>,
#> # arr_delay <dbl>, carrier <chr>, flight <dbl>, origin <chr>,
#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
#> # time_hour <dttm>
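Applied to the question's data (a sketch, assuming the whitespace-delimited data.txt created above):
library(vroom)
dat <- vroom("data.txt", delim = " ", col_select = c(Year:Jun))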

You can do it like this:
## read just the first data row to learn the column names and classes
df = read.table("file.txt", nrows=1, header=TRUE, sep="\t", stringsAsFactors=FALSE)
colClasses = as.list(sapply(df, class))
## set the classes of the unwanted columns to NULL so they are skipped
needCols = c("Year", "Jan", "Feb", "Mar", "Apr", "May", "Jun")
colClasses[!names(colClasses) %in% needCols] = list(NULL)
## re-read the full file with the pruned colClasses
df = read.table("file.txt", header=TRUE, colClasses=colClasses, sep="\t", stringsAsFactors=FALSE)

Related

keep first row after calculating difference between rows with dplyr::lag

My question is similar to this OP and this OP, with a minor difference that seems to be overly complicated.
Example of my data:
ind_id wt date
1002 25 1987-07-27
1002 15 1988-05-05
2340 30 1987-03-18
2340 52 1989-08-15
I am calculating the difference between wt values after group_by(ind_id).
To do this:
df <- df %>%
  group_by(ind_id) %>%
  mutate(mass_diff = wt - lag(wt))
This gives me this output:
ind_id wt date mass_diff
1002 15 1988-05-05 -10
2340 52 1989-08-15 22
But, the output I want should keep the first wt record, not the last.
Desired output:
ind_id wt date mass_diff
1002 25 1988-05-05 -10
2340 30 1989-08-15 22
Note that the wt column is the only one I'd like to have maintained from the first row. (Keep in mind that this example is overly simplified and I am actually working with 18 rows).
Any suggestions (using dplyr) would be appreciated!
A possible solution:
library(tidyverse)
df <- structure(list(ind_id = c(1002, 1002, 2340, 2340), wt = c(25,
15, 30, 52), date = structure(c(6416, 6699, 6285, 7166), class = "Date")), row.names = c(NA,
-4L), class = "data.frame")
df %>%
  group_by(ind_id) %>%
  mutate(mass_diff = wt - lag(wt)) %>%
  mutate(wt = first(wt)) %>%
  slice_tail() %>%
  ungroup()
#> # A tibble: 2 × 4
#> ind_id wt date mass_diff
#> <dbl> <dbl> <date> <dbl>
#> 1 1002 25 1988-05-05 -10
#> 2 2340 30 1989-08-15 22
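If only the last row per group is ever needed, a summarise()-based sketch gives the same result (assuming exactly two rows per ind_id, as in the example):
df %>%
  group_by(ind_id) %>%
  summarise(mass_diff = last(wt) - first(wt),
            wt = first(wt),
            date = last(date),
            .groups = "drop")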

How do I convert my date values into years in R

Another day, another new complexity faced.
Below are the columns and rows that I have as input:
ID Age
123 23 Years 1 Month 2 Days
125 28 Years 9 Month 14 Days
126 28 years
127 34 YEAR
128 35 Years 8 Month 21 Days
129 38 Years 5 Month 25 Days
130 32.8
I need them converted to years in a new column, like:
ID Age Age_new
123 23 Years 1 Month 2 Days 23.1
125 28 Years 9 Month 14 Days 28.9
126 28 years 28
127 34 YEAR 34
128 35 Years 8 Month 21 Days 35.8
129 38 Years 5 Month 25 Days 38.5
130 32.8 32.8
I have tried the stringr package, but I only get the first character string, which doesn't produce output like the above.
Here's a gross approximation:
func <- function(x, ptn) {
  ## pull out the number that immediately precedes the unit given by ptn (e.g. "year")
  out <- gsub(paste0(".*?\\b([0-9.]+)\\s*", ptn, ".*"), "\\1", x, ignore.case = TRUE)
  ## if nothing matched, gsub returns the input unchanged, so return NA instead
  ifelse(out == x, NA, out)
}
library(dplyr)
dat %>%
mutate(
data.frame(
lapply(c(yr = "year", mon = "month", day = "day"),
function(ptn) as.numeric(func(Age, ptn)))
),
yr = if_else(is.na(yr), suppressWarnings(as.numeric(Age)), yr),
across(c(yr, mon, day), ~ coalesce(., 0)), New_Age = yr + mon/12 + day/365
)
# ID Age yr mon day New_Age
# 1 123 23 Years 1 Month 2 Days 23.0 1 2 23.08881
# 2 125 28 Years 9 Month 14 Days 28.0 9 14 28.78836
# 3 126 28 years 28.0 0 0 28.00000
# 4 127 34 YEAR 34.0 0 0 34.00000
# 5 128 35 Years 8 Month 21 Days 35.0 8 21 35.72420
# 6 129 38 Years 5 Month 25 Days 38.0 5 25 38.48516
# 7 130 32.8 32.8 0 0 32.80000
(I offer no warranty on true accuracy.)
Data
dat <- structure(list(ID = c(123L, 125L, 126L, 127L, 128L, 129L, 130L), Age = c("23 Years 1 Month 2 Days", "28 Years 9 Month 14 Days", "28 years", "34 YEAR", "35 Years 8 Month 21 Days", "38 Years 5 Month 25 Days", "32.8")), class = "data.frame", row.names = c(NA, -7L))
This is my approach. I always try to avoid regex since it's too scary for me. If your data is separated exactly like your example, I think my code will work. I completely understand this is not the most efficient way, but hey, it works.
library(tidyr)      # for separate() and replace_na()
library(lubridate)  # for years(), months(), days(), as.duration()
dat %>%
  mutate(space_counter = stringr::str_count(Age, " ")) %>%
  tidyr::separate(Age, into = paste0("tmp_col_", 1:(max(.$space_counter) + 1)), sep = " ") %>%
  select(ID, tmp_col_1, tmp_col_3, tmp_col_5) %>%
  setNames(c("ID", "year", "month", "day")) %>%
  mutate(across(everything(), ~replace_na(.x, 0))) %>%
  mutate_if(is.character, as.integer) %>%
  mutate(asdur = as.duration(years(year) + months(month) + days(day))) %>%
  mutate(age_new = as.numeric(asdur) / 3.154e+7)

Complicated data formation

So I am trying to make a separate dataset that combines the yearly absence percentage with an additional binary variable flagging those with 10% or more total absence in a year.
The absencePercentage should be calculated as the total unauthorised and authorised absence divided by the total possible sessions across all three terms.
Another thing is VioFlag: if the person has been flagged for Vio in at least one of the terms, they should be flagged with VioFlagEver.
So the original data is like this:
ID PossibleSessions Term year unauthorisedAbsence authorisedAbsence VioFlag
0110 46 Sum 2014 0 1 0
0110 116 Win 2014 1 8 1
0110 56 Spr 2014 0 5 0
0110 44 Sum 2015 21 9 0
0110 120 Win 2015 2 2 0
0110 58 Spr 2015 10 1 0
So for ID 0110, he was absent for 15 sessions (0+1+1+8+0+5=15) out of possible 218 sessions (46+116+56=218). This means the absence percentage in 2014 for ID 0110 is 6.88%. He will not be the frequent absentee that year. But because in 2015, his absent rate was 20.27%, he will be a frequent absentee.
For ID 0110, he will be VioFlagEver for 2014 but not for 2015.
The new dataset I want to create is this.
ID year absencePercentage FrenquentAbsentee VioFlagEver
0110 2014 6.88 0 1
0110 2015 20.27 1 0
Please note that there are many IDs and year 2014 to 2018.
Thank you for your help!
You can try this:
library(tidyverse)
df %>%
  group_by(ID, year) %>%
  summarize(absensepercentage = ((sum(unauthorisedAbsence) + sum(authorisedAbsence)) / sum(PossibleSessions)) * 100,
            violflagever = if_else(sum(VioFlag) > 0, 1, 0),
            frequentabsentee = if_else(absensepercentage > 10, 1, 0))
You can use tidyverse (dplyr) group_by and summarize to achieve this
library(tidyverse)
read.table(textConnection("ID PossibleSessions Term year unauthorisedAbsence authorisedAbsence VioFlag
0110 46 Sum 2014 0 1 0
0110 116 Win 2014 1 8 1
0110 56 Spr 2014 0 5 0
0110 44 Sum 2015 21 9 0
0110 120 Win 2015 2 2 0
0110 58 Spr 2015 10 1 0"),
header = T) %>%
as_tibble() -> df
df %>%
mutate(totalAbscence = unauthorisedAbsence+authorisedAbsence) %>%
group_by(ID, year) %>%
summarise(possibleAbscence = PossibleSessions %>% sum(),
totalAbscence = totalAbscence %>% sum(),
VioFlagEver = VioFlag %>% sum()) %>%
mutate(absencePercentage = (totalAbscence/possibleAbscence)*100,
FrenquentAbsentee = if_else(absencePercentage > 10, 1,0),
VioFlagEver = if_else(VioFlagEver > 0, 1, 0))
#> `summarise()` regrouping output by 'ID' (override with `.groups` argument)
#> # A tibble: 2 x 7
#> # Groups: ID [1]
#> ID year possibleAbscence totalAbscence VioFlagEver absencePercenta…
#> <int> <int> <int> <int> <dbl> <dbl>
#> 1 110 2014 218 15 1 6.88
#> 2 110 2015 222 45 0 20.3
#> # … with 1 more variable: FrenquentAbsentee <dbl>
Created on 2021-01-27 by the reprex package (v0.3.0)

Union All of SQL temporary tables created using dplyr?

How does one rbind or bind_rows temporary tables created in SQL (tested and failed in Postgres and SQLite) by dplyr?
E.g.
library(dplyr)
con <- DBI::dbConnect(RSQLite::SQLite(), path = ":memory:")
copy_to(con, nycflights13::flights, "flights",
temporary = FALSE,
indexes = list(
c("year", "month", "day"),
"carrier",
"tailnum",
"dest"
)
)
copy_to(con, nycflights13::flights, "flights2",
temporary = FALSE,
indexes = list(
c("year", "month", "day"),
"carrier",
"tailnum",
"dest"
)
)
flights_db <- tbl(con, "flights")
flights_db_2 <- tbl(con, "flights2")
Calling bind_rows gives the following error:
> bind_rows(flights_db, flights_db_2)
Error in bind_rows_(x, .id) :
Argument 1 must be a data frame or a named atomic vector, not a tbl_dbi/tbl_sql/tbl_lazy/tbl
Because a database union returns only unique records, and here the two objects ('flights' and 'flights2') contain identical data, we can use
union(flights_db, flights_db_2)
The above will only return the same dimensions as 'flights_db', because both objects are identical. If we need double the number of rows, create a unique identifier first:
flights1 <- nycflights13::flights %>%
mutate(id= 1)
flights2 <- nycflights13::flights %>%
mutate(id = 2)
copy_to(con, flights1, "flights",
temporary = FALSE,
overwrite = TRUE,
indexes = list(
c("year", "month", "day"),
"carrier",
"tailnum",
"dest"
)
)
copy_to(con, flights2, "flights2",
temporary = FALSE,
overwrite = TRUE,
indexes = list(
c("year", "month", "day"),
"carrier",
"tailnum",
"dest"
)
)
flights_db <- tbl(con, "flights")
flights_db_2 <- tbl(con, "flights2")
Now we do the union
union(flights_db, flights_db_2) %>%
summarise(n = n())
# Source: lazy query [?? x 1]
# Database: sqlite 3.19.3 []
# n
# <int>
#1 673552
dim(nycflights13::flights)
#[1] 336776 19
To demonstrate the uniqueness, we can select small, disjoint subsets of rows for the two objects and then do the union:
copy_to(con, nycflights13::flights[1:20,], "flights",
temporary = FALSE,
overwrite = TRUE,
indexes = list(
c("year", "month", "day"),
"carrier",
"tailnum",
"dest"
)
)
copy_to(con, nycflights13::flights[21:30,], "flights2",
temporary = FALSE,
overwrite = TRUE,
indexes = list(
c("year", "month", "day"),
"carrier",
"tailnum",
"dest"
)
)
flights_db <- tbl(con, "flights")
flights_db_2 <- tbl(con, "flights2")
union(flights_db, flights_db_2) %>%
collect
# A tibble: 30 x 19
# year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance
# <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl>
# 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR IAH 227 1400
# 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA IAH 227 1416
# 3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK MIA 160 1089
# 4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN 183 1576
# 5 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD 150 719
# 6 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL 116 762
# 7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR FLL 158 1065
# 8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA IAD 53 229
# 9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK MCO 140 944
#10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA ORD 138 733
# ... with 20 more rows, and 3 more variables: hour <dbl>, minute <dbl>, time_hour <dbl>
With thanks to Akrun for pointing me to the union family, it is possible to somewhat replicate bind_rows with:
Reduce(union_all, list(flights_db, flights_db, flights_db))
As noted in the comments and in Akrun's answer, union produces only unique records in the result, while union_all is the equivalent of SQL's UNION ALL.
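For example, a minimal sketch that keeps duplicates when stacking the two lazy tables:
union_all(flights_db, flights_db_2) %>%
  summarise(n = n())
# counts all rows from both tables, including duplicates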
