Order by letters and numbers - r

I have a DF$vector which looks like this:
A10 A50
C1 C4
B1
A7
C3
B1 B4
I look for a way to order it as follows:
A10 A50
A7
B1 B4
B1
C1 C4
C3
I tried to use gsub :
# Attempt: sort by the letter prefix (capture group 1), then by the numeric
# suffix (capture group 2) converted to numeric. NOTE(review): this operates
# on a vector, while the data shown is a two-column data frame — presumably
# that mismatch is why it "didn't return what I want"; verify against DF.
vector[order(gsub("([A-Z]+)([0-9]+)", "\\1", vector),
as.numeric(gsub("([A-Z]+)([0-9]+)", "\\2", vector)))]
But it didn't return what I want.
Thank you for any suggestions.

We can use order from base R
# Build the three sort keys explicitly, then order the rows:
#   1. letter prefix of column 1 (digits stripped),
#   2. numeric suffix of column 1 (non-digits stripped),
#   3. rows with a non-empty second column first (FALSE sorts before TRUE).
letter_key <- sub("\\d+", "", df1[, 1])
number_key <- as.numeric(sub("\\D+", "", df1[, 1]))
df1[order(letter_key, number_key, df1[, 2] == ""), ]
# A10 A50
#3 A7
#5 B1 B4
#2 B1
#1 C1 C4
#4 C3
data
# Reproducible input: two character columns; empty strings "" mark rows that
# have only one code (the A10/A50 column names are themselves data-like).
df1 <-structure(list(A10 = c("C1", "B1", "A7", "C3", "B1"), A50 = c("C4",
"", "", "", "B4")), .Names = c("A10", "A50"), class = "data.frame",
row.names = c(NA, -5L))

In programming languages, the letters are considered to be increasing in terms of magnitude. Thus A is considered to be less than B, etc. Thus to order the above, just use the code:
# Rank the character column; ties.method = "last" keeps later duplicates
# after earlier ones. NOTE(review): rank() on character is lexicographic,
# so e.g. "B10" would sort before "B2" — it matches the desired output for
# this sample only; a numeric-aware sort needs the prefix/suffix split.
df1$r=rank(df1$A10,ties.method = "last")
df1[order(df1$r),-ncol(df1)]
A10 A50
3 A7
5 B1 B4
2 B1
1 C1 C4
4 C3

Related

in R4.1.2 How to remove duplicate cells in a row leaving only the first cell

How to remove a repeated duplicate cell in row, leaving only the first cell.
(Remove the 2nd A3)
V1 V2 V3
A1 NA C1
A2 NA C2
A3 A3 C3
A4 NA C4
A5 NA C5
A6 NA C6
A7 NA C7
A8 NA C8
my target
V1 V2 V3
A1 NA C1
A2 NA C2
A3 NA C3
A4 NA C4
A5 NA C5
A6 NA C6
A7 NA C7
A8 NA C8
# Blank the 2nd column of every row whose value also appears anywhere in
# the 1st column (column-wide match, not just the same row).
# Bug fix: the original `for (x in nrow(dataset))` loops over the single
# scalar nrow(dataset), so only the LAST row was ever examined;
# seq_len(nrow(dataset)) visits every row (and is safe for 0-row frames).
for (x in seq_len(nrow(dataset))) {
  if (dataset[x, 2] %in% dataset[, 1]) {
    dataset[x, 2] <- NA
  }
}
Something like this I guess
It should work even if the A3 is not in the same row as the A3 in the first column
If you need the target is to remove the same values only in the same row then replace if statement by
if(dataset[x,2]==dataset[x ,1])
A possible solution:
library(tidyverse)

# Reproducible example: row 3 repeats "A3" in both V1 and V2.
df <- data.frame(
  V1 = c("A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8"),
  V2 = c(NA, NA, "A3", NA, NA, NA, NA, NA),
  V3 = c("C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8"),
  stringsAsFactors = FALSE
)

# Blank V2 wherever it duplicates V1 in the SAME row; if_else() is
# type-stable, hence NA_character_ rather than plain NA.
df %>%
  mutate(V2 = if_else(V1 == V2, NA_character_, V2))
#> V1 V2 V3
#> 1 A1 <NA> C1
#> 2 A2 <NA> C2
#> 3 A3 <NA> C3
#> 4 A4 <NA> C4
#> 5 A5 <NA> C5
#> 6 A6 <NA> C6
#> 7 A7 <NA> C7
#> 8 A8 <NA> C8
Using replace.
# Blank V2 wherever its value appears ANYWHERE in V1 (column-wide match via
# %in%, not restricted to the same row — differs from the if_else answer).
transform(df, V2=replace(V2, V2 %in% V1, NA))
# V1 V2 V3
# 1 A1 <NA> C1
# 2 A2 <NA> C2
# 3 A3 <NA> C3
# 4 A4 <NA> C4
# 5 A5 <NA> C5
# 6 A6 <NA> C6
# 7 A7 <NA> C7
# 8 A8 <NA> C8
Or %in% in Reduce.
# Reduce(`%in%`, df[1:2]) collapses to df[[1]] %in% df[[2]]: TRUE for rows
# whose V1 value occurs somewhere in V2; those rows get V2 set to NA.
# NOTE(review): this tests V1-in-V2 (the reverse direction of the replace()
# answer); both select row 3 for this particular data.
df[Reduce(`%in%`, df[1:2]), 'V2'] <- NA
df
# V1 V2 V3
# 1 A1 <NA> C1
# 2 A2 <NA> C2
# 3 A3 <NA> C3
# 4 A4 <NA> C4
# 5 A5 <NA> C5
# 6 A6 <NA> C6
# 7 A7 <NA> C7
# 8 A8 <NA> C8
Data:
# Reproducible input data (dput output): the duplicate "A3" sits in row 3.
df <- structure(list(V1 = c("A1", "A2", "A3", "A4", "A5", "A6", "A7",
"A8"), V2 = c(NA, NA, "A3", NA, NA, NA, NA, NA), V3 = c("C1",
"C2", "C3", "C4", "C5", "C6", "C7", "C8")), row.names = c(NA,
-8L), class = "data.frame")

Make indexing of values in column

I have a dataframe:
dput(df)
structure(list(ID = c("A1", "A1", "A1", "A1", "A1", "A1", "B2",
"B2", "B2", "B2", "B2", "B2", "B2", "B2", "B2", "B2"), operation = c("open",
"open", "close", "", "open", "close", "", "open", "open", "open",
"close", "upload", "open", "close", "open", "close")), class = "data.frame", row.names = c(NA,
-16L))
ID operation
A1 open
A1 open
A1 close
A1
A1 open
A1 close
B2
B2 open
B2 open
B2 open
B2 close
B2 upload
B2 open
B2 close
B2 open
B2 close
I want to add an index for each bundle of "open" and "close" in the operation column, so that every row between an "open" and its "close" gets the same index. The desired result is:
ID operation index
A1 open 1
A1 open 1
A1 close 1
A1
A1 open 2
A1 close 2
B2
B2 open 3
B2 open 3
B2 open 3
B2 close 3
B2 upload
B2 open 4
B2 close 4
B2 open 5
B2 close 5
I do it like this:
# Step 1: rev(cumsum(rev(operation) == 'close')) counts 'close' rows from the
# bottom up, so all rows up to and including each 'close' share one value;
# .GRP turns each such run into a distinct group number.
dt[, index := .GRP, by = .(rev(cumsum(rev(operation) == 'close')))]
# Step 2: within each (ID, index) bundle, blank the index on rows that come
# before the first 'open' (cumsum of opens is still 0 there).
dt[, index := ifelse(cumsum(operation == 'open') > 0, index, NA), by = .(ID, index)]
However I want there be two options for "close". It can be "close" or it can be "checking":
ID operation
A1 open
A1 open
A1 checking
A1
A1 open
A1 close
B2
B2 open
B2 open
B2 open
B2 close
B2 upload
B2 open
B2 close
B2 open
B2 close
and i want to get:
ID operation index
A1 open 1
A1 open 1
A1 checking 1
A1
A1 open 2
A1 close 2
B2
B2 open 3
B2 open 3
B2 open 3
B2 close 3
B2 upload
B2 open 4
B2 close 4
B2 open 5
B2 close 5
How could I add this "or" option (close OR checking)?
You could use %in% to check for multiple values.
library(data.table)
setDT(dt)
# Same bottom-up bundling as before, but a bundle now ends at EITHER
# 'close' or 'checking': %in% checks membership in the set of terminators.
dt[, index := .GRP, by = .(rev(cumsum(rev(operation) %in% c('close', 'checking'))))]
# Blank the index on rows preceding the first 'open' of each bundle.
dt[, index := ifelse(cumsum(operation == 'open') > 0, index, NA), by = .(ID, index)]

Create a data frame where corresponding values are t interval back

I have a huge data frame with following syntax (the four variables are just for example, there are many more variables):
Date. Ticker. Revenue. Price.
a1 b1 c1 d1
a2 b1 c2 d2
a3 b1 c3 d3
a4 b1 c4 d4
a5 b1 c5 d5
a1 b2 c6 d6
a2 b2 c7 d7
a3 b2 c8 d8
a4 b2 c9 d9
a5 b2 c10 d10
...
The ticker b1 and b2 are in order in the example, but in the real df it might be mixed up.
What I want is to create a new data frame with prices that goes to t intervals back. For example, if I need 3 years back, the result will be:
Date. Ticker. Revenue. Price.
a1 b1 c1
a2 b1 c2
a3 b1 c3
a4 b1 c4 d1
a5 b1 c5 d2
a1 b2 c6
a2 b2 c7
a3 b2 c8
a4 b2 c9 d6
a5 b2 c10 d7
...
We can use lag in dplyr to go back t intervals.
library(dplyr)

# Look-back window, in rows per ticker. NOTE(review): `t` shadows base::t();
# the name is kept because the data.table snippet below reuses it.
t <- 3

# Within each Ticker, replace Price with the value t rows earlier; the
# first t rows of every group have no predecessor and become NA.
df %>%
  group_by(Ticker) %>%
  mutate(Price = lag(Price, t))
# Date Ticker Revenue Price
# <chr> <chr> <chr> <chr>
# 1 a1 b1 c1 NA
# 2 a2 b1 c2 NA
# 3 a3 b1 c3 NA
# 4 a4 b1 c4 d1
# 5 a5 b1 c5 d2
# 6 a1 b2 c6 NA
# 7 a2 b2 c7 NA
# 8 a3 b2 c8 NA
# 9 a4 b2 c9 d6
#10 a5 b2 c10 d7
Or shift in data.table :
library(data.table)
# shift() defaults to type = "lag"; := updates Price in place per Ticker.
# `t` is the window size defined in the dplyr snippet above (3).
setDT(df)[, Price := shift(Price, t), Ticker]
data
# Reproducible input: two tickers (b1, b2), five dates each, all character.
df <- structure(list(Date = c("a1", "a2", "a3", "a4", "a5", "a1", "a2",
"a3", "a4", "a5"), Ticker = c("b1", "b1", "b1", "b1", "b1", "b2",
"b2", "b2", "b2", "b2"), Revenue = c("c1", "c2", "c3", "c4",
"c5", "c6", "c7", "c8", "c9", "c10"), Price = c("d1", "d2", "d3",
"d4", "d5", "d6", "d7", "d8", "d9", "d10")),
class = "data.frame", row.names = c(NA, -10L))
We can use data.table methods
library(data.table)
# Overwrites Price. in place, lagging 3 rows per Ticker.; fill = "" pads the
# first 3 rows of each group with empty strings instead of NA.
setDT(df)[, Price. := shift(Price., 3, fill = ""), Ticker.]
or with dplyr
library(dplyr)
# Creates a NEW column `Price` alongside the original `Price.` (the output
# below shows both); default = "" pads the first 3 rows of each group.
df %>%
group_by(Ticker.) %>%
mutate(Price = lag(Price., 3, default = ""))
-output
# A tibble: 10 x 5
# Groups: Ticker. [2]
# Date. Ticker. Revenue. Price. Price
# <chr> <chr> <chr> <chr> <chr>
# 1 a1 b1 c1 d1 ""
# 2 a2 b1 c2 d2 ""
# 3 a3 b1 c3 d3 ""
# 4 a4 b1 c4 d4 "d1"
# 5 a5 b1 c5 d5 "d2"
# 6 a1 b2 c6 d6 ""
# 7 a2 b2 c7 d7 ""
# 8 a3 b2 c8 d8 ""
# 9 a4 b2 c9 d9 "d6"
#10 a5 b2 c10 d10 "d7"
Or using base R with ave
# Base-R lag: for each Ticker. group, prepend three "" and drop the last
# three values. NOTE(review): assumes every group has at least 3 rows —
# shorter groups would make the replacement length mismatch; verify.
df$Price <- with(df, ave(Price., Ticker., FUN =
function(x) c(rep('', 3), head(x, -3))))
data
# Same fixture as above but with dotted column names (Date., Ticker., ...).
df <- structure(list(Date. = c("a1", "a2", "a3", "a4", "a5", "a1",
"a2", "a3", "a4", "a5"), Ticker. = c("b1", "b1", "b1", "b1",
"b1", "b2", "b2", "b2", "b2", "b2"), Revenue. = c("c1", "c2",
"c3", "c4", "c5", "c6", "c7", "c8", "c9", "c10"), Price. = c("d1",
"d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10")), class = "data.frame",
row.names = c(NA,
-10L))

R: Merge Data While Retaining Values for One Dataset in Duplicates

I have two data sets, data1 and data2:
# First dataset: values to PREFER in the merge; NAs mark missing cells.
data1 <- data.frame(ID = 1:6,
A = c("a1", "a2", NA, "a4", "a5", NA),
B = c("b1", "b2", "b3", NA, "b5", NA),
stringsAsFactors = FALSE)
data1
ID A B
1 a1 b1
2 a2 b2
3 NA b3
4 a4 NA
5 a5 b5
6 NA NA
and
# Second dataset: fallback values; note row 2's "b2.wrong" must LOSE to
# data1's "b2" in the merged result.
data2 <- data.frame(ID = 1:6,
A = c(NA, "a2", "a3", NA, "a5", "a6"),
B = c(NA, "b2.wrong", NA, "b4", "b5", "b6"),
stringsAsFactors = FALSE)
data2
ID A B
1 NA NA
2 a2 b2.wrong
3 a3 NA
4 NA b4
5 a5 b5
6 a6 b6
I would like to merge them by ID so that the resulting merged dataset, data.merged, populates fields from both datasets, but chooses values from data1 whenever both datasets have a value.
I.e., I would like the final dataset, data.merge, to be:
ID A B
1 a1 b1
2 a2 b2
3 a3 b3
4 a4 b4
5 a5 b5
6 a6 b6
I have looked around, finding similar but not exact answers.
You can join the data and use coalesce to select the first non-NA value.
library(dplyr)

# Join on ID, then keep the first non-missing value per cell, preferring
# data1 (the .x suffix columns) over data2 (the .y suffix columns).
joined <- inner_join(data1, data2, by = "ID")
joined %>%
  mutate(
    A = coalesce(A.x, A.y),
    B = coalesce(B.x, B.y)
  ) %>%
  select(all_of(names(data1)))
# ID A B
#1 1 a1 b1
#2 2 a2 b2
#3 3 a3 b3
#4 4 a4 b4
#5 5 a5 b5
#6 6 a6 b6
Or in base R comparing values with NA :
# Base-R equivalent: merge on ID, then for each column take data1's value
# (.x) unless it is NA, in which case fall back to data2's (.y); finally
# restrict to data1's original column order.
transform(merge(data1, data2, by = 'ID'),
A = ifelse(is.na(A.x), A.y, A.x),
B = ifelse(is.na(B.x), B.y, B.x))[names(data1)]

Wrap new row based on condition from column [duplicate]

This question already has answers here:
R semicolon delimited a column into rows
(3 answers)
Closed 6 years ago.
I have a dataset that has following format:
id1 a1 b2 x1;x2;x3
id2 a2 b3 x4;x5
id3 a4 b5 x6
id4 a7 b7 x7;x8
First 3 columns (id, a, b) only have 1 instance, but the last column has multiple instances, separated by ;. How can I "wrap" these into new columns? Such as:
id1 a1 b2 x1
id1 a1 b2 x2
id1 a1 b2 x3
id2 a2 b3 x4
id2 a2 b3 x5
id3 a4 b5 x6
id4 a7 b7 x7
id4 a7 b7 x8
We can use cSplit
library(splitstackshape)
# Split the ';'-separated v4 values and emit one row per piece ("long"
# direction), repeating the other columns for each piece.
cSplit(df1, "v4", ";", "long")
# v1 v2 v3 v4
#1: id1 a1 b2 x1
#2: id1 a1 b2 x2
#3: id1 a1 b2 x3
#4: id2 a2 b3 x4
#5: id2 a2 b3 x5
#6: id3 a4 b5 x6
#7: id4 a7 b7 x7
#8: id4 a7 b7 x8
data
# Reproducible input: v4 holds 1-3 ';'-delimited values per row.
df1 <- structure(list(v1 = c("id1", "id2", "id3", "id4"), v2 = c("a1",
"a2", "a4", "a7"), v3 = c("b2", "b3", "b5", "b7"), v4 = c("x1;x2;x3",
"x4;x5", "x6", "x7;x8")), .Names = c("v1", "v2", "v3", "v4"),
class = "data.frame", row.names = c(NA, -4L))

Resources