Make indexing of values in column - r

I have a dataframe:
dput(df)
structure(list(ID = c("A1", "A1", "A1", "A1", "A1", "A1", "B2",
"B2", "B2", "B2", "B2", "B2", "B2", "B2", "B2", "B2"), operation = c("open",
"open", "close", "", "open", "close", "", "open", "open", "open",
"close", "upload", "open", "close", "open", "close")), class = "data.frame", row.names = c(NA,
-16L))
ID operation
A1 open
A1 open
A1 close
A1
A1 open
A1 close
B2
B2 open
B2 open
B2 open
B2 close
B2 upload
B2 open
B2 close
B2 open
B2 close
I want to add an index for each bundle of "open" and "close" in the column operation, so that every row from an "open" through its matching "close" has the same index. The desired result is:
ID operation index
A1 open 1
A1 open 1
A1 close 1
A1
A1 open 2
A1 close 2
B2
B2 open 3
B2 open 3
B2 open 3
B2 close 3
B2 upload
B2 open 4
B2 close 4
B2 open 5
B2 close 5
I do it like this:
dt[, index := .GRP, by = .(rev(cumsum(rev(operation) == 'close')))]
dt[, index := ifelse(cumsum(operation == 'open') > 0, index, NA), by = .(ID, index)]
However, I want there to be two options for "close": it can be either "close" or "checking":
ID operation
A1 open
A1 open
A1 checking
A1
A1 open
A1 close
B2
B2 open
B2 open
B2 open
B2 close
B2 upload
B2 open
B2 close
B2 open
B2 close
and i want to get:
ID operation index
A1 open 1
A1 open 1
A1 checking 1
A1
A1 open 2
A1 close 2
B2
B2 open 3
B2 open 3
B2 open 3
B2 close 3
B2 upload
B2 open 4
B2 close 4
B2 open 5
B2 close 5
How could I add this "or" option?

You could use %in% to check for multiple values.
library(data.table)
setDT(dt)
dt[, index := .GRP, by = .(rev(cumsum(rev(operation) %in% c('close', 'checking'))))]
dt[, index := ifelse(cumsum(operation == 'open') > 0, index, NA), by = .(ID, index)]

Related

Create a data frame where corresponding values are t interval back

I have a huge data frame with following syntax (the four variables are just for example, there are many more variables):
Date. Ticker. Revenue. Price.
a1 b1 c1 d1
a2 b1 c2 d2
a3 b1 c3 d3
a4 b1 c4 d4
a5 b1 c5 d5
a1 b2 c6 d6
a2 b2 c7 d7
a3 b2 c8 d8
a4 b2 c9 d9
a5 b2 c10 d10
...
The ticker b1 and b2 are in order in the example, but in the real df it might be mixed up.
What I want is to create a new data frame with prices that goes to t intervals back. For example, if I need 3 years back, the result will be:
Date. Ticker. Revenue. Price.
a1 b1 c1
a2 b1 c2
a3 b1 c3
a4 b1 c4 d1
a5 b1 c5 d2
a1 b2 c6
a2 b2 c7
a3 b2 c8
a4 b2 c9 d6
a5 b2 c10 d7
...
We can use lag in dplyr to go back t intervals.
library(dplyr)
t <- 3
df %>% group_by(Ticker) %>% mutate(Price= lag(Price, t))
# Date Ticker Revenue Price
# <chr> <chr> <chr> <chr>
# 1 a1 b1 c1 NA
# 2 a2 b1 c2 NA
# 3 a3 b1 c3 NA
# 4 a4 b1 c4 d1
# 5 a5 b1 c5 d2
# 6 a1 b2 c6 NA
# 7 a2 b2 c7 NA
# 8 a3 b2 c8 NA
# 9 a4 b2 c9 d6
#10 a5 b2 c10 d7
Or shift in data.table :
library(data.table)
setDT(df)[, Price := shift(Price, t), Ticker]
data
df <- structure(list(Date = c("a1", "a2", "a3", "a4", "a5", "a1", "a2",
"a3", "a4", "a5"), Ticker = c("b1", "b1", "b1", "b1", "b1", "b2",
"b2", "b2", "b2", "b2"), Revenue = c("c1", "c2", "c3", "c4",
"c5", "c6", "c7", "c8", "c9", "c10"), Price = c("d1", "d2", "d3",
"d4", "d5", "d6", "d7", "d8", "d9", "d10")),
class = "data.frame", row.names = c(NA, -10L))
We can use data.table methods
library(data.table)
setDT(df)[, Price. := shift(Price., 3, fill = ""), Ticker.]
or with dplyr
library(dplyr)
df %>%
group_by(Ticker.) %>%
mutate(Price = lag(Price., 3, default = ""))
-output
# A tibble: 10 x 5
# Groups: Ticker. [2]
# Date. Ticker. Revenue. Price. Price
# <chr> <chr> <chr> <chr> <chr>
# 1 a1 b1 c1 d1 ""
# 2 a2 b1 c2 d2 ""
# 3 a3 b1 c3 d3 ""
# 4 a4 b1 c4 d4 "d1"
# 5 a5 b1 c5 d5 "d2"
# 6 a1 b2 c6 d6 ""
# 7 a2 b2 c7 d7 ""
# 8 a3 b2 c8 d8 ""
# 9 a4 b2 c9 d9 "d6"
#10 a5 b2 c10 d10 "d7"
Or using base R with ave
df$Price <- with(df, ave(Price., Ticker., FUN =
function(x) c(rep('', 3), head(x, -3))))
data
df <- structure(list(Date. = c("a1", "a2", "a3", "a4", "a5", "a1",
"a2", "a3", "a4", "a5"), Ticker. = c("b1", "b1", "b1", "b1",
"b1", "b2", "b2", "b2", "b2", "b2"), Revenue. = c("c1", "c2",
"c3", "c4", "c5", "c6", "c7", "c8", "c9", "c10"), Price. = c("d1",
"d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10")), class = "data.frame",
row.names = c(NA,
-10L))

R: Merge Data While Retaining Values for One Dataset in Duplicates

I have two data sets, data1 and data2:
data1 <- data.frame(ID = 1:6,
A = c("a1", "a2", NA, "a4", "a5", NA),
B = c("b1", "b2", "b3", NA, "b5", NA),
stringsAsFactors = FALSE)
data1
ID A B
1 a1 b1
2 a2 b2
3 NA b3
4 a4 NA
5 a5 b5
6 NA NA
and
data2 <- data.frame(ID = 1:6,
A = c(NA, "a2", "a3", NA, "a5", "a6"),
B = c(NA, "b2.wrong", NA, "b4", "b5", "b6"),
stringsAsFactors = FALSE)
data2
ID A B
1 NA NA
2 a2 b2.wrong
3 a3 NA
4 NA b4
5 a5 b5
6 a6 b6
I would like to merge them by ID so that the resulting merged dataset, data.merged, populates fields from both datasets, but chooses values from data1 whenever there are possible values from both datasets.
I.e., I would like the final dataset, data.merged, to be:
ID A B
1 a1 b1
2 a2 b2
3 a3 b3
4 a4 b4
5 a5 b5
6 a6 b6
I have looked around, finding similar but not exact answers.
You can join the data and use coalesce to select the first non-NA value.
library(dplyr)
data1 %>%
inner_join(data2, by = 'ID') %>%
mutate(A = coalesce(A.x, A.y),
B = coalesce(B.x, B.y)) %>%
select(names(data1))
# ID A B
#1 1 a1 b1
#2 2 a2 b2
#3 3 a3 b3
#4 4 a4 b4
#5 5 a5 b5
#6 6 a6 b6
Or in base R comparing values with NA :
transform(merge(data1, data2, by = 'ID'),
A = ifelse(is.na(A.x), A.y, A.x),
B = ifelse(is.na(B.x), B.y, B.x))[names(data1)]

how to create new variables from one variable using two rules

I would appreciate any help to create new variables from one variable.
Specifically, I need help to simultaneously create one row per ID and several columns of E, where each of the new columns (that is, E1, E2, E3) contains the values of E for each row of ID. I tried doing this with melt followed by spread, but I am getting the error:
Error: Duplicate identifiers for rows (4, 7, 9), (1, 3, 6), (2, 5, 8)
Additionally, I tried the solutions discussed here and here but these did not work for my case because I need to be able to create row identifiers for rows (4, 1, 2), (7, 3, 5), and (9, 6, 8). That is, E for rows (4, 1, 2) should be named E1, E for rows (7, 3, 5) should be named E2, E for rows (9, 6, 8) should be named E3, and so on.
#data
dT<-structure(list(A = c("a1", "a2", "a1", "a1", "a2", "a1", "a1",
"a2", "a1"), B = c("b2", "b2", "b2", "b1", "b2", "b2", "b1",
"b2", "b1"), ID = c("3", "4", "3", "1", "4", "3", "1", "4", "1"
), E = c(0.621142094943352, 0.742109450696123, 0.39439152996948,
0.40694392882818, 0.779607277916503, 0.550579323666347, 0.352622183880119,
0.690660491345867, 0.23378944873769)), class = c("data.table",
"data.frame"), row.names = c(NA, -9L))
#my attempt
A B ID E
1: a1 b2 3 0.6211421
2: a2 b2 4 0.7421095
3: a1 b2 3 0.3943915
4: a1 b1 1 0.4069439
5: a2 b2 4 0.7796073
6: a1 b2 3 0.5505793
7: a1 b1 1 0.3526222
8: a2 b2 4 0.6906605
9: a1 b1 1 0.2337894
# Reshape dT to long form, keeping A, B and ID as identifier columns
# (the stray trailing ")" that made this line a syntax error is removed).
aTempDF <- melt(dT, id.vars = c("A", "B", "ID"))
A B ID variable value
1: a1 b2 3 E 0.6211421
2: a2 b2 4 E 0.7421095
3: a1 b2 3 E 0.3943915
4: a1 b1 1 E 0.4069439
5: a2 b2 4 E 0.7796073
6: a1 b2 3 E 0.5505793
7: a1 b1 1 E 0.3526222
8: a2 b2 4 E 0.6906605
9: a1 b1 1 E 0.2337894
aTempDF%>%spread(variable, value)
Error: Duplicate identifiers for rows (4, 7, 9), (1, 3, 6), (2, 5, 8)
#expected output
A B ID E1 E2 E3
1: a1 b2 3 0.6211421 0.3943915 0.5505793
2: a2 b2 4 0.7421095 0.7796073 0.6906605
3: a1 b1 1 0.4069439 0.3526222 0.2337894
Thanks in advance for any help.
You can use dcast from data.table
library(data.table)
# Spread E into one column per within-ID occurrence (E1, E2, ...).
# value.var is named explicitly so dcast does not have to guess the value column.
dcast(dT, A + B + ID ~ paste0("E", rowid(ID)), value.var = "E")
# A B ID E1 E2 E3
#1 a1 b1 1 0.4069439 0.3526222 0.2337894
#2 a1 b2 3 0.6211421 0.3943915 0.5505793
#3 a2 b2 4 0.7421095 0.7796073 0.6906605
You need to create the correct 'time variable' first which is what rowid(ID) does.
For those looking for a tidyverse solution:
library(tidyverse)
# Sample data from the question, rebuilt here so the answer is self-contained.
# E must stay in the same row order as in the question: the E1/E2/E3 columns
# are numbered by each row's position within its (A, B, ID) group, so a
# reordered E vector yields a different reshaped table than the one shown below.
dT <- structure(
  list(
    A = c("a1", "a2", "a1", "a1", "a2", "a1", "a1", "a2", "a1"),
    B = c("b2", "b2", "b2", "b1", "b2", "b2", "b1", "b2", "b1"),
    ID = c("3", "4", "3", "1", "4", "3", "1", "4", "1"),
    E = c(0.621142094943352, 0.742109450696123, 0.39439152996948,
          0.40694392882818, 0.779607277916503, 0.550579323666347,
          0.352622183880119, 0.690660491345867, 0.23378944873769)),
  class = c("data.table",
            "data.frame"),
  row.names = c(NA, -9L))
dT %>%
as_tibble() %>% # since dataset is a data.table object
group_by(A, B, ID) %>%
# Just so columns are "E1", "E2", etc.
mutate(rn = glue::glue("E{row_number()}")) %>%
ungroup() %>%
spread(rn, E) %>%
# not necessary, just making output in the same order as your expected output
arrange(desc(B))
# A tibble: 3 x 6
# A B ID E1 E2 E3
# <chr> <chr> <chr> <dbl> <dbl> <dbl>
#1 a1 b2 3 0.621 0.394 0.551
#2 a2 b2 4 0.742 0.780 0.691
#3 a1 b1 1 0.407 0.353 0.234
As mentioned in the accepted answer, you need a "key" variable to spread on first. This is created using row_number() and glue where glue just gives you the proper E1, E2, etc. variable names.
The group_by piece just makes sure that the row numbers are with respect to A, B and ID.
EDIT for tidyr >= 1.0.0
The (not-so) new pivot_ functions supersede gather and spread and eliminate the need to glue the new variable names together in a mutate.
dT %>%
as_tibble() %>% # since dataset is a data.table object
group_by(A, B, ID) %>%
# no longer need to glue (or paste) the names together but still need a row number
mutate(rn = row_number()) %>%
ungroup() %>%
pivot_wider(names_from = rn, values_from = E, names_glue = "E{.name}") %>% # names_glue argument allows for easy transforming of the new variable names
# not necessary, just making output in the same order as your expected output
arrange(desc(B))
# A tibble: 3 x 6
# A B ID E1 E2 E3
# <chr> <chr> <chr> <dbl> <dbl> <dbl>
#1 a1 b2 3 0.621 0.394 0.551
#2 a2 b2 4 0.742 0.780 0.691
#3 a1 b1 1 0.407 0.353 0.234

transpose/reshape with customised names

To create the sample needed:
require(pacman)
p_load(data.table)
DT_start <- data.table(ID = c(1,1,1,2,2,2), valueA = c("a1","a2","a3","b1","b2","b3"), valueB = c("A1","A2","A3","B1","B2","B3"))
DT_end <- data.table(ID = c(1,2)
, T01_valueA = c("a1","b1")
, T02_valueA = c("a2","b2")
, T03_valueA = c("a3","b3")
, T01_valueB = c("A1","B1")
, T02_valueB = c("A2","B2")
, T03_valueB = c("A3","B3"))
setcolorder(DT_end, c("ID","T01_valueA","T01_valueB","T02_valueA","T02_valueB","T03_valueA","T03_valueB"))
I have:
> DT_start
ID valueA valueB
1: 1 a1 A1
2: 1 a2 A2
3: 1 a3 A3
4: 2 b1 B1
5: 2 b2 B2
6: 2 b3 B3
I need:
> DT_end
ID T01_valueA T01_valueB T02_valueA T02_valueB T03_valueA T03_valueB
1: 1 a1 A1 a2 A2 a3 A3
2: 2 b1 B1 b2 B2 b3 B3
How to achieve it? basically transpose DT_start to DT_end with customized names: T01, T02, T03...
Using the input DT in the Note at the end we create a sequence within ID column s, melt it to long form and then dcast it back to the desired wide form. (The dcast formula could alternately be written as ID ~ s + variable.)
library(data.table)
DT[, s := sprintf("T%02d", seq_along(.I)), ID]
m <- melt(DT, id.vars = c("ID", "s"))
dcast(m, ID ~ ...)
giving:
ID T01_valueA T01_valueB T02_valueA T02_valueB T03_valueA T03_valueB
1: 1 a1 A1 a2 A2 a3 A3
2: 2 b1 B1 b2 B2 b3 B3
Note:
Input used:
library(data.table)
DF <- structure(list(ID = c(1L, 1L, 1L, 2L, 2L, 2L), valueA = c("a1",
"a2", "a3", "b1", "b2", "b3"), valueB = c("A1", "A2", "A3", "B1",
"B2", "B3")), class = "data.frame",
row.names = c(NA, -6L))
DT <- as.data.table(DF)

Order by letters and numbers

I have a DF$vector which looks like this:
A10 A50
C1 C4
B1
A7
C3
B1 B4
I look for a way to order it as follows:
A10 A50
A7
B1 B4
B1
C1 C4
C3
I tried to use gsub :
vector[order(gsub("([A-Z]+)([0-9]+)", "\\1", vector),
as.numeric(gsub("([A-Z]+)([0-9]+)", "\\2", vector)))]
But it didn't return what I want.
Thank you for any suggestions.
We can use order from base R
df1[order(sub("\\d+", "", df1[,1]), as.numeric(sub("\\D+", "", df1[,1])), df1[,2] == ""),]
# A10 A50
#3 A7
#5 B1 B4
#2 B1
#1 C1 C4
#4 C3
data
df1 <-structure(list(A10 = c("C1", "B1", "A7", "C3", "B1"), A50 = c("C4",
"", "", "", "B4")), .Names = c("A10", "A50"), class = "data.frame",
row.names = c(NA, -5L))
In programming languages, letters are ordered by increasing magnitude, so A is considered to be less than B, and so on. Thus, to order the above, just use the code:
df1$r=rank(df1$A10,ties.method = "last")
df1[order(df1$r),-ncol(df1)]
A10 A50
3 A7
5 B1 B4
2 B1
1 C1 C4
4 C3

Resources