Import CSV with multiple ranges and headers - R

I am trying to import the following file, which has two kinds of repeated sections of data to extract. The first section begins with an unused header (line 5) and a real header beginning with "ES" (line 6). The next section begins with an unused header (line 13) and a real header beginning with "LU" (line 14) plus more variable names. There are many of these files, and each contains different numbers of ES and LU sections of varying length. I need to extract the ES and LU data into separate dataframes. Unfortunately, the files come "as is" off a sensor array; I can't change that, and I would prefer not to do all this in Excel, but may have to. In the real files there may be hundreds of rows in each ES and LU set.
I have tried to adapt the following code to index the ES sections; I would then have extracted and cleaned those, and done the same for the LU sections, but I did not even get this far. Part of the reason is that "ES" appears in both header rows. I did see code using Perl scripts, but I have never used that language.
lns = readLines("lake1.txt")                   # read the raw lines
idx = grepl("ES", lns)                         # flag lines containing the section label
df = read.table(text = lns[!idx])              # parse the remaining lines
wd = diff(c(which(idx), length(idx) + 1)) - 1  # number of rows in each section
df$label = rep(lns[idx], wd)                   # label each row with its section header
I was not sure of the best way to include a CSV file example, but here it is:
Garbage Text 1,,,,,,,,
Garbage Text 2,,,,,,,,
Garbage Text 3,,,,,,,,
,,,,,,,,
INTTIME ('sec'),SAMPLE ('sec'),ES_DARK ('uW/cm^2/nm'),ES_DARK ('uW/cm^2/nm'),ES_DARK ('uW/cm^2/nm'),CHECK (''),DATETAG (NONE),TIMETAG2 (NONE),POSFRAME (NONE)
ES,DELAY,344.83,348.23,351.62,SUM,NONE,NONE,COUNTS
0.032,0,0.35441789,-0.00060208,0.10290995,87,2017015,10:42:39,1
0.032,0,-0.36023974,-0.22242269,-0.09639,109,2017015,10:42:40,10
0.032,0,0.07552711,0.01524224,-0.16756855,91,2017015,10:42:48,41
,,,,,,,,11304
,,,,,,,,11312
,,,,,,,,
INTTIME ('sec'),SAMPLE ('sec'),LU ('uW/cm^2/nm/sr'),LU ('uW/cm^2/nm/sr'),LU ('uW/cm^2/nm/sr'),CHECK (''),DATETAG (NONE),TIMETAG2 (NONE),POSFRAME (NONE)
LU,DELAY,344.37,347.75,351.13,SUM,NONE,NONE,COUNTS
0.032,0,0.02288441,0.02891912,0.03595322,53,2017015,10:42:38,2
0.032,0,-0.00014323,0.00024047,0.00001585,212,2017015,10:42:38,6
0.032,0,0.00114258,0.00091736,-0.0000495,16,2017015,10:42:39,9
0.032,0,0.00020744,0.0004186,0.00027721,118,2017015,10:42:40,16
,,,,,,,,11310
,,,,,,,,
INTTIME ('sec'),SAMPLE ('sec'),ES ('uW/cm^2/nm'),ES ('uW/cm^2/nm'),ES ('uW/cm^2/nm'),CHECK (''),DATETAG (NONE),TIMETAG2 (NONE),POSFRAME (NONE)
ES,DELAY,344.83,348.23,351.62,SUM,NONE,NONE,COUNTS
0.032,0,56.7600789,59.43147464,62.83968564,186,2017015,10:42:38,3
0.032,0,56.27202003,59.52654061,62.86815706,29,2017015,10:42:38,4
,,,,,,,,11309
,,,,,,,,11311
,,,,,,,,
INTTIME ('sec'),SAMPLE ('sec'),LU ('uW/cm^2/nm/sr'),LU ('uW/cm^2/nm/sr'),LU ('uW/cm^2/nm/sr'),CHECK (''),DATETAG (NONE),TIMETAG2 (NONE),POSFRAME (NONE)
LU,DELAY,344.37,347.75,351.13,SUM,NONE,NONE,COUNTS
0.032,0,-0.00011611,-0.00039544,-0.00014584,3,2017015,10:42:42,20
0.032,0,-0.00032394,-0.00020563,-0.00020383,229,2017015,10:42:46,39
This is what the two dataframes should look like in the end:
Dataframe 1
ES,DELAY,344.83,348.23,351.62,SUM,NONE,NONE,COUNTS
0.032,0,0.35441789,-0.00060208,0.10290995,87,2017015,10:42:39,1
0.032,0,-0.36023974,-0.22242269,-0.09639,109,2017015,10:42:40,10
0.032,0,0.07552711,0.01524224,-0.16756855,91,2017015,10:42:48,41
0.032,0,56.7600789,59.43147464,62.83968564,186,2017015,10:42:38,3
0.032,0,56.27202003,59.52654061,62.86815706,29,2017015,10:42:38,4
Dataframe 2
LU,DELAY,344.37,347.75,351.13,SUM,NONE,NONE,COUNTS
0.032,0,0.02288441,0.02891912,0.03595322,53,2017015,10:42:38,2
0.032,0,-0.00014323,0.00024047,0.00001585,212,2017015,10:42:38,6
0.032,0,0.00114258,0.00091736,-0.0000495,16,2017015,10:42:39,9
0.032,0,0.00020744,0.0004186,0.00027721,118,2017015,10:42:40,16
0.032,0,-0.00011611,-0.00039544,-0.00014584,3,2017015,10:42:42,20
0.032,0,-0.00032394,-0.00020563,-0.00020383,229,2017015,10:42:46,39

Here is a way you can solve this with tidyverse tools:
readr for reading/writing CSV files
dplyr for data frame manipulation
stringr for string manipulation
library(readr)
library(dplyr)
library(stringr)
df_1 <- read_csv("test1.csv", col_names = FALSE, col_types = cols(.default = "c"), skip = 3)
First remove the rows where all values (or all but the last) are missing, along with the rows containing the extra headers.
Then create a new column holding the ES or LU marker (NA otherwise), and use tidyr::fill to fill those values down.
Then change the two columns with NONE to DATE and TIME, because later we don't want two columns with the same name.
df_2 <- df_1 %>%
  filter(!is.na(X1), !str_detect(X1, "INTTIME")) %>%
  mutate(grp = if_else(X1 %in% c("ES", "LU"), X1, NA_character_)) %>%
  tidyr::fill(grp, .direction = "down") %>%
  mutate(X7 = str_replace(X7, "NONE", "DATE"),
         X8 = str_replace(X8, "NONE", "TIME"))
df_2
#> # A tibble: 15 x 10
#> X1 X2 X3 X4 X5 X6 X7 X8 X9 grp
#> * <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 ES DELAY 344.83 348.23 351.62 SUM DATE TIME COUNTS ES
#> 2 0.032 0 0.35441789 -0.00060208 0.10290995 87 2017015 10:42:39 1 ES
#> 3 0.032 0 -0.36023974 -0.22242269 -0.09639 109 2017015 10:42:40 10 ES
#> 4 0.032 0 0.07552711 0.01524224 -0.16756855 91 2017015 10:42:48 41 ES
#> 5 LU DELAY 344.37 347.75 351.13 SUM DATE TIME COUNTS LU
#> 6 0.032 0 0.02288441 0.02891912 0.03595322 53 2017015 10:42:38 2 LU
#> 7 0.032 0 -0.00014323 0.00024047 0.00001585 212 2017015 10:42:38 6 LU
#> 8 0.032 0 0.00114258 0.00091736 -0.0000495 16 2017015 10:42:39 9 LU
#> 9 0.032 0 0.00020744 0.0004186 0.00027721 118 2017015 10:42:40 16 LU
#> 10 ES DELAY 344.83 348.23 351.62 SUM DATE TIME COUNTS ES
#> 11 0.032 0 56.7600789 59.43147464 62.83968564 186 2017015 10:42:38 3 ES
#> 12 0.032 0 56.27202003 59.52654061 62.86815706 29 2017015 10:42:38 4 ES
#> 13 LU DELAY 344.37 347.75 351.13 SUM DATE TIME COUNTS LU
#> 14 0.032 0 -0.00011611 -0.00039544 -0.00014584 3 2017015 10:42:42 20 LU
#> 15 0.032 0 -0.00032394 -0.00020563 -0.00020383 229 2017015 10:42:46 39 LU
Now, for each of ES and LU, you can filter to just those records, remove the new grp column, use the first row as the column names, drop those repeated header rows, and write the result to a new cleaned CSV file.
df_es <- df_2 %>%
  filter(grp == "ES") %>%
  select(-grp) %>%
  purrr::set_names(., .[1, ]) %>%
  filter(ES != "ES") %>%
  write_csv("ES.csv")
df_es
#> # A tibble: 5 x 9
#> ES DELAY `344.83` `348.23` `351.62` SUM DATE TIME COUNTS
#> * <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 0.032 0 0.35441789 -0.00060208 0.10290995 87 2017015 10:42:39 1
#> 2 0.032 0 -0.36023974 -0.22242269 -0.09639 109 2017015 10:42:40 10
#> 3 0.032 0 0.07552711 0.01524224 -0.16756855 91 2017015 10:42:48 41
#> 4 0.032 0 56.7600789 59.43147464 62.83968564 186 2017015 10:42:38 3
#> 5 0.032 0 56.27202003 59.52654061 62.86815706 29 2017015 10:42:38 4
df_lu <- df_2 %>%
  filter(grp == "LU") %>%
  select(-grp) %>%
  purrr::set_names(., .[1, ]) %>%
  filter(LU != "LU") %>%
  write_csv("LU.csv")
df_lu
#> # A tibble: 6 x 9
#> LU DELAY `344.37` `347.75` `351.13` SUM DATE TIME COUNTS
#> * <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 0.032 0 0.02288441 0.02891912 0.03595322 53 2017015 10:42:38 2
#> 2 0.032 0 -0.00014323 0.00024047 0.00001585 212 2017015 10:42:38 6
#> 3 0.032 0 0.00114258 0.00091736 -0.0000495 16 2017015 10:42:39 9
#> 4 0.032 0 0.00020744 0.0004186 0.00027721 118 2017015 10:42:40 16
#> 5 0.032 0 -0.00011611 -0.00039544 -0.00014584 3 2017015 10:42:42 20
#> 6 0.032 0 -0.00032394 -0.00020563 -0.00020383 229 2017015 10:42:46 39
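Since there are many of these files, the same pipeline can be wrapped up and mapped over a folder. This is only a sketch under the assumption that every file shares the example's layout (three garbage lines at the top, matching wavelength headers); the folder name and the helper names clean_one and extract_group are hypothetical, not part of the answer above.
library(readr)
library(dplyr)
library(stringr)

clean_one <- function(path) {
  read_csv(path, col_names = FALSE, col_types = cols(.default = "c"), skip = 3) %>%
    filter(!is.na(X1), !str_detect(X1, "INTTIME")) %>%
    mutate(grp = if_else(X1 %in% c("ES", "LU"), X1, NA_character_)) %>%
    tidyr::fill(grp, .direction = "down") %>%
    mutate(X7 = str_replace(X7, "NONE", "DATE"),
           X8 = str_replace(X8, "NONE", "TIME"))
}

extract_group <- function(df, code) {
  df %>%
    filter(grp == code) %>%
    select(-grp) %>%
    purrr::set_names(., .[1, ]) %>%  # first row becomes the column names
    filter(.data[[code]] != code)    # drop the repeated header rows
}

files <- list.files("sensor_files", pattern = "\\.txt$", full.names = TRUE)  # hypothetical folder
all_es <- purrr::map_dfr(files, ~ extract_group(clean_one(.x), "ES"))
all_lu <- purrr::map_dfr(files, ~ extract_group(clean_one(.x), "LU"))
Note that map_dfr stacks the per-file results by column name, which only makes sense if the wavelength headers really do match across files.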

Related

Force the application of the right as_tibble function on an object in a purrr::map call in R

I am trying to use the fact that as_tibble is a generic function so that I can process a column containing one of two types of objects in the same way.
The list column temp can store either a list or an object of class AsspDataObj.
I can define an as_tibble method for that class:
library(dplyr)

as_tibble.AsspDataObj <- function(x, field = 1, prefix = NULL, na.zeros = TRUE) {
  df <- data.frame(x[[field]])
  if (is.null(prefix)) {
    if (is.numeric(field)) {
      prefix <- names(x)[field]
    } else {
      prefix <- field
    }
  }
  colnames(df) <- paste(prefix, seq(1, ncol(df), 1), sep = "_")
  times <- seq(from = attr(x, "startTime"),
               by = 1 / attr(x, "sampleRate"),
               length.out = nrow(df))
  out <-
    tibble(times_orig = times,
           times_rel = seq(from = 0,
                           to = (attr(x, "endRecord") - 1) * 1000 / attr(x, "sampleRate"),
                           by = 1000 / attr(x, "sampleRate")),
           times_norm = times_rel / (max(times_rel) - min(times_rel))
    ) %>%
    dplyr::bind_cols(df)
  if (na.zeros) {
    out <- out %>%
      dplyr::mutate(across(!times_orig & !times_rel & !times_norm, ~ na_if(., 0)))
  }
  return(out)
}
and then apply the function to one of the stored objects and get the expected result.
> class(pluck(illustration, "temp",1))
[1] "AsspDataObj"
> as_tibble(pluck(illustration, "temp",1))
# A tibble: 581 × 7
times_orig times_rel times_norm fm_1 fm_2 fm_3 fm_4
<dbl> <dbl> <dbl> <int> <int> <int> <int>
1 0.0025 0 0 NA 1062 2073 3156
2 0.0075 5 0.00172 1239 2109 3113 4247
3 0.0125 10 0.00345 NA 1352 2316 3310
4 0.0175 15 0.00517 NA 1448 2555 3870
5 0.0225 20 0.00690 NA 1438 2564 3958
[...]
Now I want to apply the function to each object and expand (unnest) the output, so that the result is the as_tibble output columns for each stored AsspDataObj joined with the other columns in the illustration data set.
Now, if I do this (simplifying the output a bit by removing some columns just for now):
> illustration %>% mutate(out = map(.x = temp, .f = as_tibble)) %>% select(sl_rowIdx, out) %>% unnest(out)
I get this output:
# A tibble: 1,162 × 10
sl_rowIdx frame_time fm1 fm2 fm3 fm4 bw1 bw2 bw3 bw4
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2.5 0 1062 2073 3156 0 800 369 890
2 1 7.5 1239 2109 3113 4247 644 493 792 859
3 1 12.5 0 1352 2316 3310 0 486 762 933
4 1 17.5 0 1448 2555 3870 0 577 716 1442
5 1 22.5 0 1438 2564 3958 0 548 552 1062
6 1 27.5 0 1343 2518 4001 0 637 617 1096
which is not the output I would expect to see if the as_tibble generic above had been applied. Instead, it seems that some other as_tibble function has been applied to the object.
Why? How do I find out what as_tibble.* function has been applied here? How do I force the application of my definition?
A data excerpt is available here: https://umeauniversity-my.sharepoint.com/:u:/g/personal/frkkan96_ad_umu_se/ET_56GqikHxAuFc60y_ce1UBL0lOOJcqWaMDjDwLOxjuOw?e=IsEZPf
Sorry, I found the answer myself. The problem was (I think) in how R resolves S3 methods: a (not exported) definition of as_tibble.AsspDataObj in the package where AsspDataObj is defined
https://github.com/IPS-LMU/wrassp/blob/cbbc6e9fe100f5f32f7b30510f3008c5a5553440/R/AsspDataObj.R#L357
was being called instead of my function.
Confusing.
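For anyone else debugging this kind of thing, here is a sketch of how to see which S3 method actually runs and how to force your own. sloop is an extra package (an assumption that it is installed); getS3method ships with base R's utils.
library(sloop)

x <- purrr::pluck(illustration, "temp", 1)

# lists every candidate method and marks the one that is actually called
s3_dispatch(tibble::as_tibble(x))

# looks up the registered method for a class, even one from another package's namespace
getS3method("as_tibble", "AsspDataObj")

# bypasses dispatch entirely by calling your definition explicitly
illustration %>% mutate(out = purrr::map(temp, as_tibble.AsspDataObj))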

Time between max and min of cycles

I have a series of 60,000 data points, part of which looks like figure 1 (the whole curve is not as nice and uniform as this image; some other parts of the data look like the second image), and there are many cycles with different periods in my data.
I need to calculate the durations marked by the red, green and purple rectangles for each of the cycles (the time between each maximum and minimum, and the total time of each cycle).
Can you give me some ideas on how to do this in R? Is there any special command or package that I can use?
The premise is that the mean of the data range is used to split the data into peak and non-peak categories. Then a running id is generated to group each set of data so an appropriate min or max value can be determined. half_cycle provides the red and green boxes, while full_cycle provides the purple box for max-to-max and min-to-min. There is likely room for improvement, but it gives a method that can be adjusted as needed.
This sample uses random data since no sample data was provided.
set.seed(7)
wave <- c(seq(20, 50, 10), seq(50, 60, 0.5), seq(50, 20, -10))
df1 <- data.frame(time = seq_len(length(wave) * 5),
                  data = as.vector(replicate(5, wave + rnorm(length(wave), sd = 5))))

library(dplyr)
df1 %>%
  mutate(peak = data > mean(range(df1$data))) %>%
  mutate(run = cumsum(peak != lag(peak, default = TRUE))) %>%
  group_by(run) %>%
  mutate(max = max(data), min = min(data)) %>%
  filter((peak == TRUE & data == max) | (peak == FALSE & data == min)) %>%
  mutate(max = if_else(data == max, max, NULL), min = if_else(data == min, min, NULL)) %>%
  ungroup() %>%
  mutate(half_cycle = time - lag(time), full_cycle = time - lag(time, n = 2L))
# A tibble: 11 x 8
time data peak run max min half_cycle full_cycle
<int> <dbl> <lgl> <int> <dbl> <dbl> <int> <int>
1 2 24.0 FALSE 1 NA 24.0 NA NA
2 12 67.1 TRUE 2 67.1 NA 10 NA
3 29 15.1 FALSE 3 NA 15.1 17 27
4 54 68.5 TRUE 4 68.5 NA 25 42
5 59 20.8 FALSE 5 NA 20.8 5 30
6 80 70.6 TRUE 6 70.6 NA 21 26
7 87 18.3 FALSE 7 NA 18.3 7 28
8 108 63.1 TRUE 8 63.1 NA 21 28
9 117 13.8 FALSE 9 NA 13.8 9 30
10 140 64.5 TRUE 10 64.5 NA 23 32
11 145 22.4 FALSE 11 NA 22.4 5 28
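As for a dedicated package: pracma::findpeaks is one option worth a look (an assumption that its defaults suit the real data; minpeakdistance = 20 is just a guess to suppress noise). A sketch reusing df1 from above:
library(pracma)

# each row of the result: peak height, peak index, start index, end index
pks <- findpeaks(df1$data, minpeakdistance = 20)
trs <- findpeaks(-df1$data, minpeakdistance = 20)  # troughs = peaks of the negated series

# times between successive extrema (the half cycles), from the index column
diff(sort(c(pks[, 2], trs[, 2])))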

How to change all values except for the top values (by frequency) from a categorical variable in R

I have a data frame in R which looks similar to the one below, with the factor variable "Genre":
|Genre|Listening Time|
|Rock |1:05 |
|Pop |3:10 |
|RnB |4:12 |
|Rock |2:34 |
|Pop |5:01 |
|RnB |4:01 |
|Rock |1:34 |
|Pop |2:04 |
I want to leave the top 15 genres (by count) as they are and only rename the genres that are not among the top 15. Those should be renamed to the word "Other".
In other words, if for example the genre "RnB" is not among the top 15 genres, it should be replaced by the word "Other".
The table I would like to get would look like this then:
|Genre|Listening Time|
|Rock |1:05 |
|Pop |3:10 |
|Other|4:12 |
|Rock |2:34 |
|Pop |5:01 |
|Other|4:01 |
|Rock |1:34 |
|Pop |2:04 |
How would I approach this?
Thank you!
If you want to look into the tidyverse, you may do something like this. I have tried to mimic your data frame but added some more rows.
You start with the data > group_by Genre > arrange > choose the top 5.
library(tidyverse)
set.seed(1)

Data <- data.frame(
  listen = format(as.POSIXlt(paste0(
    as.character(sample(1:5)),
    ':',
    as.character(sample(0:59))), format = '%H:%M'), format = '%H:%M'),
  Genre = sample(c("Rock", "Pop", 'RnB'), 120, replace = TRUE)
)

Data %>%
  group_by(Genre) %>%
  arrange(desc(listen)) %>%
  select(listen) %>%
  top_n(5) %>%
  arrange(Genre)
#> Adding missing grouping variables: `Genre`
#> Selecting by listen
#> # A tibble: 15 x 2
#> # Groups: Genre [3]
#> Genre listen
#> <chr> <chr>
#> 1 Pop 05:47
#> 2 Pop 05:47
#> 3 Pop 05:43
#> 4 Pop 05:41
#> 5 Pop 05:28
#> 6 RnB 05:54
#> 7 RnB 05:44
#> 8 RnB 05:43
#> 9 RnB 05:29
#> 10 RnB 05:28
#> 11 Rock 05:54
#> 12 Rock 05:44
#> 13 Rock 05:41
#> 14 Rock 05:29
#> 15 Rock 05:26
Sorry if I have misunderstood what you wanted. If you assign the code above to a new data frame, make an anti_join to the original DF, and then mutate Genre to 'others', it should be what you want, I guess.
df <- Data %>%
  group_by(Genre) %>%
  arrange(desc(listen)) %>%
  select(listen) %>%
  top_n(5) %>%
  arrange(Genre)

# make an anti_join and assign 'others' to Genre
anti_join(Data, df) %>%
  mutate(Genre = 'others')
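If the goal is a single data frame containing both the kept and the renamed rows, the two parts could be bound back together, along these lines (a sketch; row order is not preserved, and the Next Edit below takes a cleaner route):
bind_rows(
  semi_join(Data, df),                              # rows that made the top selection
  anti_join(Data, df) %>% mutate(Genre = 'others')  # everything else, renamed
)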
Next Edit
Hopefully I have now understood your question: you just want to count how often the genres occur in your data and give those which do not belong to the top 15 the name Others. Maybe I was misled by the data frame you offered, which shows only 3 genres, so I looked some up on Wikipedia, invented a few of my own, and used LETTERS to build up a DF with a sufficient number of genres.
With count(Genre) the occurrences of the genres are counted and then arranged in descending order. I then introduced a new column with the row numbers; you can delete it if you want, as it is only there for the next step, which introduces another column named Top15 (I chose to make a new column instead of renaming the values in Genre), giving every genre on place (row) 16 or later the name Others and keeping the rest unchanged.
head(20) just prints the first 20 rows of this DF.
library(tidyverse)
set.seed(1)

Data <- data.frame(
  listen = format(as.POSIXlt(paste0(
    as.character(sample(1:5)),
    ':',
    as.character(sample(0:59))), format = '%H:%M'), format = '%H:%M'),
  Genre = sample(c("Rock", "Pop", 'RnB', 'Opera',
                   'Birthday Songs', 'HipHop',
                   'Chinese Songs', 'Napoli Lovesongs',
                   'Benga', 'Bongo', 'Kawito', 'Noise',
                   'County Blues', 'Mambo', 'Reggae',
                   LETTERS[1:24]), 300, replace = TRUE)
)

Data %>%
  count(Genre) %>%
  arrange(desc(n)) %>%
  mutate(place = row_number()) %>%
  mutate(Top15 = ifelse(place > 15, 'Others', Genre)) %>%
  head(20)
#> # A tibble: 20 x 4
#> Genre n place Top15
#> <chr> <int> <int> <chr>
#> 1 N 15 1 N
#> 2 T 13 2 T
#> 3 V 13 3 V
#> 4 K 12 4 K
#> 5 Rock 11 5 Rock
#> 6 X 11 6 X
#> 7 E 10 7 E
#> 8 W 10 8 W
#> 9 Benga 9 9 Benga
#> 10 County Blues 9 10 County Blues
#> 11 G 9 11 G
#> 12 J 9 12 J
#> 13 M 9 13 M
#> 14 Reggae 9 14 Reggae
#> 15 B 8 15 B
#> 16 D 8 16 Others
#> 17 I 8 17 Others
#> 18 P 8 18 Others
#> 19 R 8 19 Others
#> 20 S 8 20 Others
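If you then want the Top15 label attached to the original rows rather than to the counts, one way is to join it back (a sketch reusing Data from above):
top15 <- Data %>%
  count(Genre) %>%
  arrange(desc(n)) %>%
  mutate(Top15 = ifelse(row_number() > 15, 'Others', Genre)) %>%
  select(Genre, Top15)

Data %>% left_join(top15, by = "Genre")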
I hope this was what you were looking for.
I can think of a data.table solution. Let's assume your data.frame is called music, then:
library(data.table)
setDT(music)
other_genres <- music[, .N, by = genre][order(-N)][16:.N, genre]
music[genre %chin% other_genres, genre := "other"]
The first line of effective code counts the appearances by genre, sorts them from largest to smallest, and selects from the 16th down to the last one, assigning the result to a variable called other_genres.
The second line checks which genres are in that list and updates their name to "other".
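A quick sanity check with toy data (a sketch; the column is called genre, as in the answer, and the 26 letters stand in for genres):
library(data.table)
set.seed(42)
music <- data.table(genre = sample(letters, 500, replace = TRUE))
other_genres <- music[, .N, by = genre][order(-N)][16:.N, genre]
music[genre %chin% other_genres, genre := "other"]
music[, .N, by = genre][order(-N)]  # at most 16 labels remain: the top 15 plus "other"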
library(dplyr)
set.seed(123)

compute_listen_time <- function(n.songs) {
  min <- sample(1:15, n.songs, replace = TRUE)
  sec <- sample(0:59, n.songs, replace = TRUE)
  sec <- ifelse(sec > 9, sec, paste0("0", sec))  # zero-pad single-digit seconds
  paste0(min, ":", sec)
}

df <- data.frame(
  Genre = sample(c("Rock", "Pop", "RnB", "Rock", "Pop"), 100, replace = TRUE),
  Listen_Time = compute_listen_time(100)
)

df <- add_count(df, Genre, name = "count") %>%
  mutate(
    rank = dense_rank(desc(count)),
    group = ifelse(rank <= 15, Genre, "other")
  )
df
There is a pretty neat solution with the forcats package, applied here to the diamonds dataset to keep only the top 5 clarity values and lump the rest together as "Other":
library(dplyr)
library(forcats)
library(ggplot2)  # for the diamonds dataset

diamonds %>%
  mutate(clarity2 = fct_lump(fct_infreq(clarity), n = 5))
Result:
# A tibble: 53,940 x 11
carat cut color clarity depth table price x y z clarity2
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <ord>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 SI2
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 SI1
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 VS1
4 0.290 Premium I VS2 62.4 58 334 4.2 4.23 2.63 VS2
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 SI2
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 VVS2
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 Other
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 SI1
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 VS2
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 VS1
# … with 53,930 more rows
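Applied to the question's data, the same idea would presumably look like this (assuming a data frame df with a factor column Genre; other_level is fct_lump's argument for naming the lumped level):
library(dplyr)
library(forcats)

df %>%
  mutate(Genre = fct_lump(fct_infreq(Genre), n = 15, other_level = "Other"))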
Try replacing df with your data.frame to check if you get the desired output:
df <- data.frame(Genre = sample(letters, 1000, replace = TRUE),
                 ListeningTime = runif(1000, 3, 5))
> head(df)
Genre ListeningTime
1 j 3.437013
2 n 4.151121
3 p 3.109044
4 z 4.529619
5 h 4.043982
6 i 3.590463
freq <- table(df$Genre)
sorted <- sort(freq, decreasing=TRUE) # Sorted by frequency of df$Genre
> sorted
d x o q r u g i j f a p b e v n w c k m z l h t y s
53 50 46 45 45 42 41 41 40 39 38 38 37 37 37 36 36 35 35 35 35 34 33 33 30 29
not_top_15 <- names(sorted[-(1:15)]) # The Genres not in the top 15
pos <- which(df$Genre %in% not_top_15) # Their position in df
> head(df[pos, ]) # The original data, without the top 15 Genres
Genre ListeningTime
2 n 4.151121
4 z 4.529619
5 h 4.043982
7 s 3.521054
16 w 3.528091
18 h 4.588815
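To complete the relabelling from there, those positions can be overwritten in place (a sketch continuing with the objects above):
df$Genre <- as.character(df$Genre)  # in case Genre is a factor
df$Genre[pos] <- "Other"
head(sort(table(df$Genre), decreasing = TRUE), 16)  # the top 15 plus "Other"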

Rank subset into quantiles using Ntile

I have a dataset containing 42,840 observations spanning 119 unique months (Dataset$date). The idea is that I want to assign a quantile to every Dataset$Value within each month, 'ranking' them from 1 (lowest value) to 5 (highest value).
Date     Name(ID)  Value  Quantile (the column I want to add, with values from 1 to 5)
2009-03  1         35     ...
2009-04  1         20     ...
2009-05  1         65     ...
2009-03  2         24     ...
2009-04  2         77     ...
2009-03  3         110    ...
...
2018-12  3         125    ...
2009-03  56        24     ...
2009-04  56        65     ...
2009-03  57        26     ...
2009-04  57        67     ...
2009-03  58        99     ...
I've tried to use the ntile function, which works great for the whole dataset, but there doesn't seem to be a way to apply it separately within each month.
Any suggestions?
You could use the base rank function with dplyr's group_by:
library(dplyr)

# Create some data
N <- 3
dat <- tibble(
  date = rep(1:12, N),
  value = runif(12 * N, 0, 100)
)

# The rescale function we will use later to fit your 1-5 scale
## Adapted from https://stackoverflow.com/questions/25962508/rescaling-a-variable-in-r
RESCALE <- function(x, nx1, nx2, minx, maxx) {
  nx <- nx1 + (nx2 - nx1) * (x - minx) / (maxx - minx)
  return(ceiling(nx))
}

# What you want
dat %>%
  group_by(date) %>% # group by date so the ranks are computed within each month
  mutate(rank_detail = rank(value), # ranks the values within each group
         rank_group = RESCALE(rank_detail, 1, 5, min(rank_detail), max(rank_detail))) %>% # rescales the ranking to your 1-5 scale
  arrange(date)
# A tibble: 36 x 4
# # Groups: date [12]
# date value rank_detail rank_group
# <int> <dbl> <dbl> <dbl>
# 1 1 92.7 3 5
# 2 1 53.6 2 3
# 3 1 47.8 1 1
# 4 2 24.6 2 3
# 5 2 72.2 3 5
# 6 2 11.5 1 1
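For the record, dplyr's ntile itself also respects group_by, so the original ntile idea works once the data are grouped by month. A minimal sketch reusing dat from above:
dat %>%
  group_by(date) %>%
  mutate(quantile = ntile(value, 5)) %>%  # 1 = lowest fifth, 5 = highest fifth within each date
  ungroup()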

Reading fixed width format data into R with entries exceeding column width

I need to use the Annual Building Permits by Metropolitan Area data distributed by the US Census Bureau, which are downloadable here as fixed-width format text files. Here is an excerpt of the file (I've stripped the column names, as they aren't in a nice format and can be replaced after reading the file into a data frame):
999 10180 Abilene, TX 306 298 8 0 0 0
184 10420 Akron, OH 909 905 0 4 0 0
999 13980 Blacksburg-Christiansburg-Radford,
VA 543 455 0 4 84 3
145 14010 Bloomington, IL 342 214 4 0 124 7
160 15380 Buffalo-Cheektowaga-Niagara Falls,*
NY 1964 931 14 14 1005 68
268 15500 Burlington, NC 1353 938 12 16 387 20
As seen in the above excerpt, many of the entries in the Name column exceed the width of the column (which looks to be 36 characters). I've experimented with the various fwf-reading functions of both the utils package and readr, but can't find a solution that takes these entries into account. Any tips would be much appreciated.
Edit: The original file excerpt was edited by a mod for formatting, and in the process the example entries where the third column width was exceeded were deleted. I've since updated the excerpt to re-include them and have stripped the column names.
I ran @markdly's code, which was submitted before this edit; it works for all the entries that don't have this issue. I exported the result to a CSV, and include an excerpt below to show what happens with the problem entries:
"38","999",NA,"13980",NA,"Blacksburg-Christiansburg-Radford,",NA,NA,NA,NA,NA,NA
"39","V","A",NA,NA,NA,"543",455,0,4,84,3
"40","145",NA,"14010",NA,"Bloomington, IL","342",214,4,0,124,7
"51","160",NA,"15380",NA,"Buffalo-Cheektowaga-Niagara Falls,*",NA,NA,NA,NA,NA,NA
"52","N","Y",NA,NA,NA,"1964",931,14,14,1005,68
"53","268",NA,"15500",NA,"Burlington, NC","1353",938,12,16,387,20
Edit 2: Most of the major metro areas I'm actually looking at don't fall into this problem category, so while it would be nice to have the data for the ones that do, if there is no workable solution, would there be a way to remove these entries from the data set altogether?
Edit:
Based on the updated information, the files are not fixed width for some records. In this situation, I think readr::read_table is more useful than read_fwf. The following example is a tidyverse approach to importing and processing one of the source files (tb3u2016.txt). A base approach might involve using something like readLines (a sketch is at the end of this answer).
Step 1 Read the file in and assign the split records a common record id
library(tidyverse)
df <- read_table("tb3u2016.txt", col_names = FALSE, skip = 11) %>%
  rownames_to_column() %>%
  mutate(record = if_else(lag(is.na(X2) & rowname > 1), lag(rowname), rowname))
df[37:40, ]
#> # A tibble: 4 x 8
#> rowname X1 X2
#> <chr> <chr> <int>
#> 1 37 999 13900 Bismarck, ND 856 629
#> 2 38 999 13980 Blacksburg-Christiansburg-Radford, NA
#> 3 39 VA 543 455
#> 4 40 145 14010 Bloomington, IL 342 214
#> # ... with 5 more variables: X3 <int>, X4 <int>, X5 <int>, X6 <int>,
#> # record <chr>
Step 2 Combine the split record text then put the contents into separate variables using tidyr::extract. Trim whitespace and remove the redundant records.
df <- df %>%
  mutate(new_X1 = if_else(rowname != record, paste0(lag(X1), X1), X1)) %>%
  extract(new_X1, c("CSA", "CBSA", "Name", "Total"), "([0-9]+) ([0-9]+) (.+) ([0-9]+)") %>%
  mutate(Name = trimws(Name)) %>%
  filter((lead(record) != record) | rowname == 1) %>%
  select(CSA, CBSA, Name, Total, X2, X3, X4, X5, X6)
df[37:39, ]
#> # A tibble: 3 x 9
#> CSA CBSA Name Total X2 X3 X4
#> <chr> <chr> <chr> <chr> <int> <int> <int>
#> 1 999 13900 Bismarck, ND 856 629 16 6
#> 2 999 13980 Blacksburg-Christiansburg-Radford,VA 543 455 0 4
#> 3 145 14010 Bloomington, IL 342 214 4 0
#> # ... with 2 more variables: X5 <int>, X6 <int>
Below is a condensed version of the solution provided to an earlier version of the question using readr::read_fwf.
Example data
library(readr)
# example data
txt <- " Num of
Struc-
tures
With
3 and 4 5 Units 5 Units
CSA CBSA Name Total 1 Unit 2 Units Units or more or more
999 10180 Abilene, TX 306 298 8 0 0 0
184 10420 Akron, OH 909 905 0 4 0 0"
write_file(txt, "example.txt")
Solution
col_widths <- c(3, 1, 5, 1, 36, 8, 8, 8, 8, 8, NA)
col_names <- c("CSA", "blank_1", "CBSA", "blank_2", "Name", "Total", "units_1", "units_2",
               "units_3_and_4", "units_5_or_more", "num_struc_5_or_more")

df <- read_fwf("example.txt", fwf_widths(col_widths, col_names), skip = 7)
df
#> # A tibble: 2 x 11
#> CSA blank_1 CBSA blank_2 Name Total units_1 units_2
#> <int> <chr> <int> <chr> <chr> <int> <int> <int>
#> 1 999 <NA> 10180 <NA> Abilene, TX 306 298 8
#> 2 184 <NA> 10420 <NA> Akron, OH 909 905 0
#> # ... with 3 more variables: units_3_and_4 <int>, units_5_or_more <int>,
#> # num_struc_5_or_more <int>
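Finally, a base-flavoured sketch of the readLines idea mentioned above: merge each overflow line into the line before it, then parse the repaired lines. The continuation-detection heuristic (the line does not start with a number) and the six trailing numeric columns are assumptions based on the excerpt, so adjust for the real files.
library(tidyverse)

lns <- readLines("tb3u2016.txt")[-(1:11)]  # drop the header block, as above
lns <- lns[nzchar(trimws(lns))]            # drop blank lines
cont <- !grepl("^\\s*[0-9]", lns)          # continuation lines don't start with a number
idx <- which(cont)
if (length(idx) > 0) {
  # glue each continuation line onto its predecessor, then remove it
  lns[idx - 1] <- paste(trimws(lns[idx - 1]), trimws(lns[idx]))
  lns <- lns[-idx]
}

# parse the repaired lines: two ids, a free-text name, then six numeric columns
# (rows that don't fit the pattern come back as NA)
tibble(line = trimws(lns)) %>%
  extract(line,
          c("CSA", "CBSA", "Name", "Total", "units_1", "units_2",
            "units_3_and_4", "units_5_or_more", "num_struc_5_or_more"),
          "^([0-9]+)\\s+([0-9]+)\\s+(.+?)\\s+([0-9]+)\\s+([0-9]+)\\s+([0-9]+)\\s+([0-9]+)\\s+([0-9]+)\\s+([0-9]+)$")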
