sequence by reducing data.table - r

require(data.table)
set.seed(333)
t <- data.table(old=1002:2001, dif=sample(1:10,1000, replace=TRUE))
t$new <- t$old + t$dif; t$foo <- rnorm(1000); t$dif <- NULL
i <- data.table(id=1:3, start=sample(1000:1990,3))
> i
id start
1: 1 1002
2: 2 1744
3: 3 1656
> head(t)
old new foo
1: 1002 1007 -0.7889534
2: 1003 1004 0.3901869
3: 1004 1014 0.7907947
4: 1005 1011 2.0964612
5: 1006 1007 1.1834171
6: 1007 1015 1.1397910
I would like to delete time points from points such that only those rows remain where new[i] = old[i-1], giving a continuous sequence of some fixed number of time points. Ideally, this would be done for all id in i simultaneously, where start gives the starting points. For example, if we choose n=5, we should obtain
> head(ans)
id old new foo
1: 1 1002 1007 -0.7889534
2: 1 1007 1015 1.1397910
3: 1 1015 1022 -1.2193670
4: 1 1022 1024 1.2039050
5: 1 1024 1026 0.4388586
6: 2 1744 1750 -0.1368320
where lines 3 to 6 cannot be inferred above and foo is a stand in for other variables that need to be kept.
Can this be done efficiently in data.table, for example, using a clever combination of joins?
PS. This question is somewhat similar to an an earlier one of mine but I have modified the situation to make it clearer.

It seems to me that you need help from graph algorithms. If you want to start with 1002, you can try:
require(igraph)
g <- graph_from_edgelist(as.matrix(t[,1:2]))
t[old %in% subcomponent(g,"1002","out")]
# 1: 1002 1007 -0.78895338
# 2: 1007 1015 1.13979100
# 3: 1015 1022 -1.21936662
# 4: 1022 1024 1.20390482
# 5: 1024 1026 0.43885860
# ---
#191: 1981 1988 -0.22054875
#192: 1988 1989 -0.22812175
#193: 1989 1995 -0.04687776
#194: 1995 2000 2.41349730
#195: 2000 2002 -1.23425666
Of course you can do the above for each start you want and limiting the results for the first n rows. For instance, we can lapply over the i$start positions and then rbindlist all the values together, declaring an id column with the i$id values. Something like:
n <- 5
rbindlist(
setNames(lapply(i$start, function(x) t[old %in% subcomponent(g,x,"out")[1:n]]), i$id),
idcol="id")
# id old new foo
# 1: 1 1002 1007 -0.7889534
# 2: 1 1007 1015 1.1397910
# 3: 1 1015 1022 -1.2193666
# 4: 1 1022 1024 1.2039048
# 5: 1 1024 1026 0.4388586
# 6: 2 1744 1750 -0.1368320
# 7: 2 1750 1758 0.3331686
# 8: 2 1758 1763 1.3040357
# 9: 2 1763 1767 -1.1715528
#10: 2 1767 1775 0.2841251
#11: 3 1656 1659 -0.1556208
#12: 3 1659 1663 0.1663042
#13: 3 1663 1669 0.3781835
#14: 3 1669 1670 0.2760948
#15: 3 1670 1675 0.3745026

Related

Filling column if information is available in another row

I have data as follows:
dat <- structure(list(ZIP_source1 = c(1026, 1026, 1026, 1026, 1026,
1026, 1026, 1026, 1026, 1026, 1017, 1012, 1012), ZIP_source2 = c(1026,
1026, 1026, 1026, 1026, 1026, NA, NA, NA, NA, NA, 1012, 1012),
Category_source2 = c(4, 4, 4, 4, 4, 4, NA, NA, NA, NA, NA, 4, 4)), class = c("data.table",
"data.frame"), row.names = c(NA, -13L))
dat
ZIP_source1 ZIP_source2 Category_source2
1: 1016 1016 4
2: 1016 1016 4
3: 1016 1016 4
4: 1016 1016 4
5: 1016 1016 4
6: 1016 1016 4
7: 1016 NA NA
8: 1016 NA NA
9: 1016 NA NA
10: 1016 NA NA
11: 1027 NA NA
12: 1022 1022 4
13: 1022 1022 4
For line 7 to 10, I know from source 1 what the zip code is. From source 2 I know that this zip code
falls in category 4. What is the best way to do this?
Desired output:
ZIP_source1 ZIP_source2 Category_source2
1: 1016 1016 4
2: 1016 1016 4
3: 1016 1016 4
4: 1016 1016 4
5: 1016 1016 4
6: 1016 1016 4
7: 1016 NA 4
8: 1016 NA 4
9: 1016 NA 4
10: 1016 NA 4
11: 1027 NA NA
12: 1022 1022 4
13: 1022 1022 4
We can use fill
library(dplyr)
library(tidyr)
dat %>%
group_by(ZIP_source1) %>%
fill(Category_source2, .direction = "downup")
Or using nafill
library(data.table)
dat[, Category_source2 := nafill(nafill(Category_source2,
type = "locf"), type = "nocb"), ZIP_source1]
-output
> dat
ZIP_source1 ZIP_source2 Category_source2
<num> <num> <num>
1: 1026 1026 4
2: 1026 1026 4
3: 1026 1026 4
4: 1026 1026 4
5: 1026 1026 4
6: 1026 1026 4
7: 1026 NA 4
8: 1026 NA 4
9: 1026 NA 4
10: 1026 NA 4
11: 1017 NA NA
12: 1012 1012 4
13: 1012 1012 4
I'd prefer to create new columns to do this, which I will call zip and category, but it's straightforward to overwrite the original columns if you want.
# Get all zips where not NA in one column
dat <- dat %>%
mutate(
zip = coalesce(ZIP_source1, ZIP_source2)
)
# Create table of all categories
category_table <- dat %>%
select(Category_source2, zip) %>%
drop_na() %>%
group_by(zip) %>%
distinct() %>%
rename(category = Category_source2)
category_table
# category zip
# <dbl> <dbl>
# 1 4 1026
# 2 4 1012
# Join as new column
left_join(dat, category_table, by = "zip")
# left_join(dat, category_table, by = "zip")
# ZIP_source1 ZIP_source2 Category_source2 zip category
# 1 1026 1026 4 1026 4
# 2 1026 1026 4 1026 4
# 3 1026 1026 4 1026 4
# 4 1026 1026 4 1026 4
# 5 1026 1026 4 1026 4
# 6 1026 1026 4 1026 4
# 7 1026 NA NA 1026 4
# 8 1026 NA NA 1026 4
# 9 1026 NA NA 1026 4
# 10 1026 NA NA 1026 4
# 11 1017 NA NA 1017 NA
# 12 1012 1012 4 1012 4
# 13 1012 1012 4 1012 4

igraph, POSIX, and data.table

In an earlier question, I learned that graphs are useful to collapse these data
require(data.table)
set.seed(333)
t <- data.table(old=1002:2001, dif=sample(1:10,1000, replace=TRUE))
t$new <- t$old + t$dif; t$foo <- rnorm(1000); t$dif <- NULL
> head(t)
old new foo
1: 1002 1007 -0.7889534
2: 1003 1004 0.3901869
3: 1004 1014 0.7907947
4: 1005 1011 2.0964612
5: 1006 1007 1.1834171
6: 1007 1015 1.1397910
to obtain only those rows such that new[i] = old[i-1]. The result could then be joined into a table with users who each have their own starting points
i <- data.table(id=1:3, start=sample(1000:1990,3))
> i
id start
1: 1 1002
2: 2 1744
3: 3 1656
Specifically, when only the first n=3 steps are calculated, the solution was
> library(igraph)
> i[, t[old %in% subcomponent(g, start, "out")[1:n]], by=.(id)]
id old new foo
1: 1 1002 1007 -0.7889534
2: 1 1007 1015 1.1397910
3: 1 1015 1022 -1.2193666
4: 2 1744 1750 -0.1368320
5: 2 1750 1758 0.3331686
6: 2 1758 1763 1.3040357
7: 3 1656 1659 -0.1556208
8: 3 1659 1663 0.1663042
9: 3 1663 1669 0.3781835
When implementing this when the setup is the same but new, old, and start are POSIXct class,
set.seed(333)
u <- data.table(old=seq(from=as.POSIXct("2013-01-01"),
to=as.POSIXct("2013-01-02"), by="15 mins"),
dif=as.difftime(sample(seq(15,120,15),97,replace=TRUE),units="mins"))
u$new <- u$old + u$dif; u$foo <- rnorm(97); u$dif <- NULL
j <- data.table(id=1:3, start=sample(seq(from=as.POSIXct("2013-01-01"),
to=as.POSIXct("2013-01-01 22:00:00"), by="15 mins"),3))
> head(u)
old new foo
1: 2013-01-01 00:00:00 2013-01-01 01:00:00 -1.5434407
2: 2013-01-01 00:15:00 2013-01-01 00:30:00 -0.2753971
3: 2013-01-01 00:30:00 2013-01-01 02:30:00 -1.5986916
4: 2013-01-01 00:45:00 2013-01-01 02:00:00 -0.6288528
5: 2013-01-01 01:00:00 2013-01-01 01:15:00 -0.8967041
6: 2013-01-01 01:15:00 2013-01-01 02:45:00 -1.2145590
> j
id start
1: 1 2013-01-01 22:00:00
2: 2 2013-01-01 21:00:00
3: 3 2013-01-01 13:30:00
the command
> j[, u[old %in% subcomponent(h, V(h)$name %in% as.character(start), "out")[1:n]], by=.(id)]
Empty data.table (0 rows and 4 cols): id,old,new,foo
returns an empty vector, which appears to be due to the inner part u[...]. I do not quite see where the problem is in this case and wonder whether anyone spots a mistake.

How to create a new table given X and Y from a data frame

I'm trying to get a new dataset where it can take two columns and make a new table based on a calculation of a third column.
Cust T S1 S2 S3 S4
1009 150 1007 1006 1001 1000
1010 50 1007 1006 1001 1000
1011 50 1007 1006 1001 1000
1013 10000 1007 1006 1001 1000
1931 60 1008 1007 1006 1005
1141 1000 1014 1013 1007 1006
I need to make a new table where it is:
Cust 1014 1013 1008 1007 1006 1001 1000
1009 NA NA NA T *.1 T *.1 T*.05 T * .025
1010 NA NA NA T *.1 T *.1 T*.05 T * .025
1011 NA NA NA T *.1 T *.1 T*.05 T * .025
1013 NA NA NA T *.1 T *.1 T*.05 T * .025
1931 NA NA T*.1 T *.1 T*.05 T * .025 NA
1141 T*.1 T *.1 NA T*.05 T * .025 NA NA
I just can't seem to figure it out and I'm not even sure if it is possible.
A tidyverse solution:
library(tidyverse)
df %>% gather(select = -c(Cust, T)) %>%
select(-key) %>%
spread(value, T) %>%
map2_dfc(c(1, .025, .05, rep(.1, 6)), ~ .x * .y)
# Cust `1000` `1001` `1005` `1006` `1007` `1008` `1013` `1014`
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1009 3.75 7.5 NA 15 15 NA NA NA
# 2 1010 1.25 2.5 NA 5 5 NA NA NA
# 3 1011 1.25 2.5 NA 5 5 NA NA NA
# 4 1013 250 500 NA 1000 1000 NA NA NA
# 5 1141 NA NA NA 100 100 NA 100 100
# 6 1931 NA NA 6 6 6 6 NA NA
library(dplyr)
library(tidyr)
library(data.table)
df %>% gather(key=k,value = val, -c('Cust','T')) %>%
mutate(val_upd=ifelse(k=='S1'|k=='S2','T*.1',ifelse(k=='S3','T*.05','T*.025'))) %>%
#Change 'T*.1' to T*.1 to get the actual value
select(-T,-k) %>% dcast(Cust~val,value.var='val_upd')
Cust 1000 1001 1005 1006 1007 1008 1013 1014
1 1009 T*.025 T*.05 <NA> T*.1 T*.1 <NA> <NA> <NA>
2 1010 T*.025 T*.05 <NA> T*.1 T*.1 <NA> <NA> <NA>
3 1011 T*.025 T*.05 <NA> T*.1 T*.1 <NA> <NA> <NA>
4 1013 T*.025 T*.05 <NA> T*.1 T*.1 <NA> <NA> <NA>
5 1141 <NA> <NA> <NA> T*.025 T*.05 <NA> T*.1 T*.1
6 1931 <NA> <NA> T*.025 T*.05 T*.1 T*.1 <NA> <NA>
Data
df <- read.table(text = "
Cust T S1 S2 S3 S4
1009 150 1007 1006 1001 1000
1010 50 1007 1006 1001 1000
1011 50 1007 1006 1001 1000
1013 10000 1007 1006 1001 1000
1931 60 1008 1007 1006 1005
1141 1000 1014 1013 1007 1006
", header=TRUE)
This is one way using a combination of reshape2::melt, dplyr::select, tidyr::spread and dplyr::mutate. May not be the best way, but it should do what you want:
# Read the data (if you don't already have it loaded)
df <- read.table(text="Cust T S1 S2 S3 S4
1009 150 1007 1006 1001 1000
1010 50 1007 1006 1001 1000
1011 50 1007 1006 1001 1000
1013 10000 1007 1006 1001 1000", header=T)
# Manipulate your data.frame. Replace df with the name of your data.frame
reshape2::melt(df, c("Cust", "T"), c("S1", "S2", "S3", "S4")) %>%
dplyr::select(-variable) %>%
tidyr::spread(value, T) %>%
dplyr::mutate(`1007`=`1007`*0.1,
`1006`=`1006`*0.1,
`1001`=`1001`*0.05,
`1000`=`1000`*0.025)
# Cust 1000 1001 1006 1007
#1 1009 3.75 7.5 15 15
#2 1010 1.25 2.5 5 5
#3 1011 1.25 2.5 5 5
#4 1013 250.00 500.0 1000 1000
You'll need the backticks as R doesn't handle having numeric colnames very well.
Let me know if I've misunderstood anything/something doesn't make sense

R: How to create a conditional column indirectly based on a non-static amount of other columns, in a data.table

I have the following data.table:
Name x y h 120Hz 800Hz 1000Hz 1200Hz
1: Tower1 1354 829 245 0 8 7 0
2: Tower2 2654 234 285 7 0 3 0
3: Tower3 822 3040 256 0 4 0 9
4: Tower4 987 2747 250 0 6 5 3
5: Tower5 1953 1739 301 0 0 8 2
You can create it with:
DT <- data.table(Name = c("Tower1", "Tower2", "Tower3", "Tower4", "Tower5"),
x = c(1354,2654,822,987,1953),
y = c(829,234,3040,2747,1739),
h = c(245,285,256,250,301),
`120Hz` = c(0,7,0,0,0),
`800Hz` = c(8,0,4,6,0),
`1000Hz` = c(7,3,0,5,8),
`1200Hz` = c(0,0,9,3,2))
In reality, it came from a previous, larger data.table. The last four columns were auto-generated from that other data.table using dcast, so there is no way to know beforehand the number or the names of the columns after column h. This is important.
The final goal is to create another column named "Range", whose value for each row depends on the values in the columns after column "h", as it follows:
Consider the following associations between frequencies and ranges. These are the only stablished associations and are static, so this information could be stored as a pre-defined data.table.
assoc <- data.table(Frq = c("800Hz", "1000Hz", "1200Hz"),
Rng = c(750,850,950))
For each one of the four columns after column "h", the code should check if the column name exists in assoc. If so, AND if the value in that column for the row in question in DT is NOT zero, then the code considers the respective Rng value (from assoc). After checking all four columns, the code should return the MAXIMUM of the ranges considered and store in the new column "Range".
My approach:
Create one auxiliar column for each frequency column:
DT <- DT[, paste0(colnames(DT)[5:ncol(DT)],'_r') := 0]
Then I could use a conditional structure that does the algorithm described above. Let's take for example column 800Hz_r. This column checks the value in column 800Hz. If that value is not zero for the row in question, then it returns 750. At the end, the column Range simply takes the maximum of the previous 4 columns, the ones ending with _f. There's where I'm stuck, I can't find an useful command to do so. Everything I've tried throws me some error.
Finally, the auxiliary _f columns should be deleted. If anyone knows a way to do it without creating auxiliar columns it would be much better.
This is the expected result (prior to deletion of auxiliary columns):
Name x y h 120Hz 800Hz 1000Hz 1200Hz 120Hz_f 800Hz_f 1000Hz_f 1200Hz_f Range
1: Tower1 1354 829 245 0 8 7 0 0 750 850 0 850
2: Tower2 2654 234 285 7 0 3 0 0 0 850 0 850
3: Tower3 822 3040 256 0 4 0 9 0 750 0 950 950
4: Tower4 987 2747 250 0 6 5 3 0 750 850 950 950
5: Tower5 1953 1739 301 0 0 8 2 0 0 850 950 950
NOTE: The reason why there could be frequency columns that don't appear in assoc is because the original data could have typos. In this example, the column 120Hz would always generate only zeros in column 120Hz_f and thus it can never get to be considered for the maximum Range. That's ok.
A back and forth to long format can make this work:
dcast(melt(DT, measure.vars=patterns("Hz$"))[assoc, on = c(variable = 'Frq')
, Rng := i.Rng * (value != 0)],
Name + x + y + h ~ variable, max, value.var='Rng')[,
do.call(function(...) pmax(..., na.rm = T), .SD), .SDcols = `120Hz`:`1200Hz`]
#[1] 850 850 950 950 950
Or you can avoid creating the intermediate columns if you loop over assoc:
DT[, Range := -Inf]
assoc[, {DT[, Range := pmax(Range, (get(Frq) != 0) * Rng)]; NULL}, by = Frq]
DT
# Name x y h 120Hz 800Hz 1000Hz 1200Hz Range
#1: Tower1 1354 829 245 0 8 7 0 850
#2: Tower2 2654 234 285 7 0 3 0 850
#3: Tower3 822 3040 256 0 4 0 9 950
#4: Tower4 987 2747 250 0 6 5 3 950
#5: Tower5 1953 1739 301 0 0 8 2 950
It is not exactly as you intend but my motto is when the algorithm does not fit the data, then format the data to the algorithm.
A bit long but simple to implement.
I melt DT with the following code and use the convert the Hz into numeric with removing the "Hz" and converting into numeric.
a <- melt(DT,id.vars=1:4)[value>0][,crit:=as.numeric(gsub("Hz","",variable))]
to get something like:
##> a
## Name x y h variable value crit
## 1: Tower1 1354 829 245 800Hz 8 800
## 2: Tower1 1354 829 245 1000Hz 7 1000
## 3: Tower2 2654 234 285 120Hz 7 120
## 4: Tower2 2654 234 285 1000Hz 3 1000
## 5: Tower3 822 3040 256 800Hz 4 800
## 6: Tower3 822 3040 256 1200Hz 9 1200
## 7: Tower4 987 2747 250 800Hz 6 800
## 8: Tower4 987 2747 250 1000Hz 5 1000
## 9: Tower4 987 2747 250 1200Hz 3 1200
## 10: Tower5 1953 1739 301 1000Hz 8 1000
## 11: Tower5 1953 1739 301 1200Hz 2 1200
Then find the max by Tower.
## > a[,.(crit=max(crit)),by=Name]
## Name crit
## 1: Tower1 1000
## 2: Tower2 1000
## 3: Tower3 1200
## 4: Tower4 1200
## 5: Tower5 1200
Then merge it back with a
b <- merge(setkey(a,Name,crit),setkey(a[,.(crit=max(crit)),by=Name],Name,crit))
To get something like
## > b
## Name crit x y h variable value
## 1: Tower1 1000 1354 829 245 1000Hz 7
## 2: Tower2 1000 2654 234 285 1000Hz 3
## 3: Tower3 1200 822 3040 256 1200Hz 9
## 4: Tower4 1200 987 2747 250 1200Hz 3
## 5: Tower5 1200 1953 1739 301 1200Hz 2
Then merge b with assoc
## > merge(b,assoc,by.x="variable",by.y="Frq")
## variable Name crit x y h value Rng
## 1: 1000Hz Tower1 1000 1354 829 245 7 850
## 2: 1000Hz Tower2 1000 2654 234 285 3 850
## 3: 1200Hz Tower3 1200 822 3040 256 9 950
## 4: 1200Hz Tower4 1200 987 2747 250 3 950
## 5: 1200Hz Tower5 1200 1953 1739 301 2 950

Checking the value from given threshold in a set of observation and continue till end of vector

Task:
I have to check that if the value in the data vector is above from the given threshold,
If in my data vector, I found 5 consecutive values greater then the given threshold then I keep these values as they are.
If I have less then 5 values (not 5 consecutive values) then I will replace these values with NA's.
The sample data and required output is shown below. In this example the threshold value is 1000. X is input data variable and the desired output is: Y = X(Threshold > 1000)
X Y
580 580
457 457
980 980
1250 NA
3600 NA
598 598
1200 1200
1345 1345
9658 9658
1253 1253
4500 4500
1150 1150
596 596
594 594
550 550
1450 NA
320 320
1780 NA
592 592
590 590
I have used the following code in R for my desired output but unable to get the appropriate one:
for (i in 1:nrow(X)) # X is my data vector
{counter=0
if (X[i]>10000)
{
for (j in i:(i+4))
{
if (X[j]>10000)
{counter=counter+1}
}
ifelse (counter < 5, NA, X[j])
}
X[i]<- NA
}
X
I am sure that the above code contain some error. I need help in the form of either a new code or modifying this code or any package in R.
Here is an approach using dplyr, using a cumulative sum of diff(x > 1000) to group the values.
library(dplyr)
df <- data.frame(x)
df
# x
# 1 580
# 2 457
# 3 980
# 4 1250
# 5 3600
# 6 598
# 7 1200
# 8 1345
# 9 9658
# 10 1253
# 11 4500
# 12 1150
# 13 596
# 14 594
# 15 550
# 16 1450
# 17 320
# 18 1780
# 19 592
# 20 590
df %>% mutate(group = cumsum(c(0, abs(diff(x>1000))))) %>%
group_by(group) %>%
mutate(count = n()) %>%
ungroup() %>%
mutate(y = ifelse(x<1000 | count > 5, x, NA))
# x group count y
# (int) (dbl) (int) (int)
# 1 580 0 3 580
# 2 457 0 3 457
# 3 980 0 3 980
# 4 1250 1 2 NA
# 5 3600 1 2 NA
# 6 598 2 1 598
# 7 1200 3 6 1200
# 8 1345 3 6 1345
# 9 9658 3 6 9658
# 10 1253 3 6 1253
# 11 4500 3 6 4500
# 12 1150 3 6 1150
# 13 596 4 3 596
# 14 594 4 3 594
# 15 550 4 3 550
# 16 1450 5 1 NA
# 17 320 6 1 320
# 18 1780 7 1 NA
# 19 592 8 2 592
# 20 590 8 2 590
Another approach :
Y<-rep(NA,nrow(X))
for (i in 1:nrow(X)) {
if (X[i,1]<1000) {Y[i]<-X[i,1]} else if (sum(X[i:min((i+4),nrow(X)),1]>1000)>=5) {
Y[i:min((i+4),nrow(X))]<-X[i:min((i+4),nrow(X)),1]}
}
returns
> Y
[1] 580 457 980 NA NA 598 1200 1345 9658 1253 4500 1150 596 594 550 NA 320 NA 592 590
This assumes that the values of X are in the first column of a dataframe named X.
It then creates Y with NAand only change the values if the criteria are met.

Resources