Subset data for the most recent month - r

I have a .txt file
test.txt
V1 V2 Date
A B 2020-01-02
C D 2020-02-27
E F 2020-09-10
G H 2020-09-15
I want to subset data based on the most recent month.
I did this which does the job but I want to extract the most recent month automatically rather than typing in manually and then extract the data
# Derive each row's month name as a factor ordered January..December.
# as.Date() is a no-op when Date is already of class Date.
# NOTE(review): "%B" is locale-dependent while month.name is English - confirm
# the session locale is English.
test$month <- factor(format(as.Date(test$Date), "%B"), levels = month.name)
# The trailing comma selects rows; without it `[` would index columns.
test.subset <- test[test$month == "September", ]

We can arrange by the Date class column and filter on the formatted value by comparing it with the last one
library(dplyr)
# Keep only the rows that fall in the most recent year-month.
# The key includes the year ('%Y-%m') so that, e.g., September 2019 is not
# mistaken for September 2020 when the data spans several years.
# max(..., na.rm = TRUE) ignores unparseable dates instead of letting an NA
# (which arrange() sorts last) become the comparison value.
test %>%
  mutate(Date = as.Date(Date), YearMonth = format(Date, '%Y-%m')) %>%
  arrange(Date) %>%
  filter(YearMonth == max(YearMonth, na.rm = TRUE)) %>%
  select(-YearMonth)
-output
# V1 V2 Date
#1 E F 2020-09-10
#2 G H 2020-09-15
data
test <- structure(list(V1 = c("A", "C", "E", "G"), V2 = c("B", "D", "F",
"H"), Date = c("2020-01-02", "2020-02-27", "2020-09-10", "2020-09-15"
)), class = "data.frame", row.names = c(NA, -4L))

Here is a base R option using subset + gsub
# Tag every row with its date string minus the trailing day digits
# (e.g. "2020-09-"), keep the rows carrying the largest tag, drop the tag.
df |>
  transform(ym = sub("\\d+$", "", Date)) |>
  subset(ym == max(ym), select = -ym)
which gives
V1 V2 Date
3 E F 2020-09-10
4 G H 2020-09-15
A data.table option
# data.table: keep the rows belonging to the most recent calendar month.
# Year and month are combined into one sortable key (year * 100 + month,
# e.g. 202009). Taking max(Year) and max(Month) separately would look for a
# month that may not exist (2019-12 and 2020-09 would give 2020-12).
# copy() keeps the caller's `df` untouched: setDT() and `:=` both modify
# their input by reference, which would leave helper columns behind on `df`.
setDT(copy(df))[
  ,
  ym := year(as.IDate(Date)) * 100L + month(as.IDate(Date))
][
  ym == max(ym, na.rm = TRUE)
][
  ,
  ym := NULL
][]
gives
V1 V2 Date
1: E F 2020-09-10
2: G H 2020-09-15
Data
> dput(df)
structure(list(V1 = c("A", "C", "E", "G"), V2 = c("B", "D", "F",
"H"), Date = c("2020-01-02", "2020-02-27", "2020-09-10", "2020-09-15"
)), class = "data.frame", row.names = c(NA, -4L))

Using the structure shared by @ThomasIsCoding, and assuming all dates fall in the same year, one could just look for the rows with the max month and filter for those:
# using datatable
library(data.table)
# Convert in place, work out the largest month number present, then keep the
# rows in that month (only meaningful while all dates share one year).
setDT(df)
latest_month <- max(month(as.IDate(df$Date)))
df[month(as.IDate(Date)) == latest_month]
V1 V2 Date
1: E F 2020-09-10
2: G H 2020-09-15

Related

Select row in a group based on priority of column values in data.table

I have a data.table as follows -
temp_dt = structure(list(group = c("A", "A", "B", "C", "D",
"D", "E", "E"), value = c(28.395, 26.206, 64.032,
7.588961, 0.053089, 0.053089, 0.795798, 0.795798), type = c("R",
"P", "R", "R", "R", "P", "R", "P")), row.names = c(NA, -8L), class = c("data.table",
"data.frame"))
> temp_dt
group value type
1: A 28.395000 R
2: A 26.206000 P
3: B 64.032000 R
4: C 7.588961 R
5: D 0.053089 R
6: D 0.053089 P
7: E 0.795798 R
8: E 0.795798 P
I want to subset the data.table temp_dt such that when a group has both types R and P, then row with type R is selected. If the group has either R or P, then whatever is available is selected.
A possible solution:
library(data.table)
# Sort type in descending order ("R" before "P"), then take the first row of
# every group.
temp_dt[order(-type), head(.SD, 1L), by = group]
group value type
<char> <num> <char>
1: A 28.395000 R
2: B 64.032000 R
3: C 7.588961 R
4: D 0.053089 R
5: E 0.795798 R
library(dplyr)
# For every group, pick type "R" when the group has an "R" row and fall back
# to "P" otherwise. Counting the rows per group is not enough: a group with a
# single row may hold either type.
temp_12 <- temp_dt %>%
  group_by(group) %>%
  summarise(Select = if ("R" %in% type) "R" else "P")
temp_12
It will give this:
group Select
<chr> <chr>
1 A R
2 B R
3 C R
4 D R
5 E R

Transposing a single row (top row) within a dataset to its own column

I have the following dataset
df <- structure(list(
X1 = c("A", "B", "C", "D"),
X2 = c("NA", "B", "C", "D"),
X3 = c("NA", "B", "C", "D"),
X4 = c("NA", "B", "C", "D")),
class = "data.frame", row.names = c(NA, -4L))
I need to transpose the top row into its own column such that the data looks like
df <- structure(list(
X1 = c("A", "A", "A"),
X2 = c("B", "C", "D"),
X3 = c("B", "C", "D"),
X4 = c("B", "C", "D")),
class = "data.frame", row.names = c(NA, -3L))
I have thought about just subsetting and taking the top row, then transposing only that one and then merging it back to the original dataset.
I am wondering if there is a more elegant solution.
Edit, Thanks for everyone's help.
The next step is to take this and apply it to a list of tibbles that were split via group_split (code thanks to #LMc).
# Split `data` into a list of tibbles and name each piece after its top-left
# cell. The split index increases at every row where Company is NA and the
# previous row's Company is NA as well.
data %>%
  group_by(split_on = cumsum(is.na(Company) & is.na(lag(Company)))) %>%
  # FALSE rather than F: F is an ordinary variable that can be reassigned
  group_split(.keep = FALSE) %>%
  `names<-`({.} %>%
    map(~ .[1,1])%>%
    unlist())
# Function-call form of: x <- df[-1, ]; x[, 1] <- df[1, 1]; x
# i.e. drop the first row, then overwrite column 1 with the top-left value.
"[<-"("["(df, -1, ), ,1,df[1,1])
:-)
# Fill column 1 with the top-left value, then show everything but row 1.
df[, 1] <- rep(df[1, 1], nrow(df))
df[-1, , drop = FALSE]
X1 X2 X3 X4
2 A B B B
3 A C C C
4 A D D D
We could use
library(dplyr)
# Overwrite X1 with its first value, then discard the first row.
df %>%
  mutate(X1 = first(X1)) %>%
  filter(row_number() != 1L)
# X1 X2 X3 X4
#1 A B B B
#2 A C C C
#3 A D D D
Or in base R (R 4.1.0)
# Native pipe (R >= 4.1): broadcast the first X1 value over the column, then
# keep every row whose position is not 1.
df |>
  transform(X1 = X1[1L]) |>
  subset(seq_len(length(X1)) != 1L)
# X1 X2 X3 X4
#2 A B B B
#3 A C C C
#4 A D D D

Summation of money amounts in character format by group

I have a data frame that contains the monetary transactions among individuals. The transactions can be two-way, i.e. A can transfer money to B and B can also transfer money to A. The structure of the data frame looks like below:
From To Amount
A B $100
A C $40
A D $30
B A $25
B C $70
C A $190
C D $110
I want to summarize the total amount of transactions among each pair of individuals who have transactions with each other and the results should be something like:
Individual_1 Individual_2 Sum
A B $125
A C $230
A D $30
B C $70
C D $110
I tried to utilize the grouping feature of the package dplyr but I think it does not apply to my case.
You can use pmin/pmax to sort From and To columns and sum the Amount value.
library(dplyr)
# pmin()/pmax() put each pair in alphabetical order, so A->B and B->A fall
# into the same group; parse_number() strips the "$" before summing.
# .groups = "drop" returns an ungrouped result (otherwise it stays grouped by
# col1 and summarise() prints a regrouping message).
df %>%
  group_by(col1 = pmin(From, To),
           col2 = pmax(From, To)) %>%
  summarise(Amount = sum(readr::parse_number(Amount)), .groups = "drop")
# col1 col2 Amount
# <chr> <chr> <dbl>
#1 A B 125
#2 A C 230
#3 A D 30
#4 B C 70
#5 C D 110
Using the same logic in base R you can do :
# Same idea in base R: build the ordered pair columns and a numeric Amount,
# then sum Amount for every (col1, col2) combination.
aggregate(
  Amount ~ col1 + col2,
  data = transform(
    df,
    col1 = pmin(From, To),
    col2 = pmax(From, To),
    Amount = as.numeric(sub('$', '', Amount, fixed = TRUE))
  ),
  FUN = sum
)
data
df <- structure(list(From = c("A", "A", "A", "B", "B", "C", "C"), To = c("B",
"C", "D", "A", "C", "A", "D"), Amount = c("$100", "$40", "$30",
"$25", "$70", "$190", "$110")), class = "data.frame", row.names = c(NA, -7L))
A solution using the tidyverse package. You need to find a way to create a common grouping column with the right order of the individuals. dat2 is the final output.
library(tidyverse)
# Build an order-independent pair key ("A_B"), total the numeric amounts per
# key, split the key back into two columns and re-attach the "$" sign.
dat2 <- dat %>%
  mutate(
    Amount = as.numeric(str_remove(Amount, "\\$")),
    Group = map2_chr(
      From, To,
      function(a, b) str_c(sort(c(a, b)), collapse = "_")
    )
  ) %>%
  group_by(Group) %>%
  summarize(Sum = sum(Amount, na.rm = TRUE)) %>%
  separate(Group, into = c("Individual_1", "Individual_2"), sep = "_") %>%
  mutate(Sum = str_c("$", Sum))
print(dat2)
# # A tibble: 5 x 3
# Individual_1 Individual_2 Sum
# <chr> <chr> <chr>
# 1 A B $125
# 2 A C $230
# 3 A D $30
# 4 B C $70
# 5 C D $110
Data
dat <- read.table(text = "From To Amount
A B $100
A C $40
A D $30
B A $25
B C $70
C A $190
C D $110",
header = TRUE)
A complete solution without packages, based on #RonakShah's great pmin/pmax approach, using list notation in aggregate (in contrast to formula notation) which allows name assignment.
# Sum the amounts per unordered pair using aggregate()'s list interface,
# which lets the output columns be named directly.
with(
  transform(
    d,
    # Strip everything except digits and the decimal point, so "$10.50"
    # becomes 10.5 (removing all non-digits would turn it into 1050).
    a = as.numeric(gsub("[^0-9.]", "", Amount)),
    # pmin()/pmax() order each pair so A->B and B->A share one group.
    b = pmin(From, To),
    c = pmax(From, To)
  ),
  aggregate(list(Sum = a), list(Individual_1 = b, Individual_2 = c),
            function(x) paste0("$", sum(x)))
)
# Individual_1 Individual_2 Sum
# 1 A B $125
# 2 A C $230
# 3 B C $70
# 4 A D $30
# 5 C D $110
Data:
d <- structure(list(From = c("A", "A", "A", "B", "B", "C", "C"), To = c("B",
"C", "D", "A", "C", "A", "D"), Amount = c("$100", "$40", "$30",
"$25", "$70", "$190", "$110")), class = "data.frame", row.names = c(NA,
-7L))

Getting columns less than any of the comma separated values in a column in R

I was wondering if anyone knows how I could keep rows that only have values <0.01 within a specific column with commas separating the values?
For instance
V1 V2 V3 V4
A B C D 0.02,0.0001,0.03,0.15
D E F G 0.05
S E G A 0.02,0.01
A B C D 0.001,0.05,0.3
And I would keep only the rows where V4 is < 0.01 in any of the comma-separated values.
A B C D 0.02,0.0001,0.03,0.15
A B C D 0.001,0.05,0.3
I appreciate the help!
An option with base R: split 'V4' on `,`, convert the pieces to numeric, check whether any value is less than 0.01, and then use the result as a row index for subsetting
# Keep the rows where at least one comma-separated value in V4 is below 0.01.
# vapply() guarantees a logical index even when df1 has zero rows (sapply()
# returns an empty list there); na.rm = TRUE stops an unparseable piece from
# producing an NA index, which would add an all-NA row to the result.
keep <- vapply(
  strsplit(as.character(df1$V4), ","),
  function(x) any(as.numeric(x) < 0.01, na.rm = TRUE),
  logical(1)
)
df1[keep, ]
You can try this:
library(stringr)
#Data
df <- structure(list(V1 = c("A", "D", "S", "A"), V2 = c("B", "E", "E",
"B"), V3 = c("C", "F", "G", "C"), V4 = c("D", "G", "A", "D"),
X = c("0.02,0.0001,0.03,0.15", "0.05", "0.02,0.01", "0.001,0.05,0.3"
)), class = "data.frame", row.names = c(NA, -4L))
structure(list(V1 = c("A", "D", "S", "A"), V2 = c("B", "E", "E",
"B"), V3 = c("C", "F", "G", "C"), V4 = c("D", "G", "A", "D"),
X = c("0.02,0.0001,0.03,0.15", "0.05", "0.02,0.01", "0.001,0.05,0.3"
)), class = "data.frame", row.names = c(NA, -4L))
#Code
#Create an index: how many of the comma-separated values in X are below 0.01
#(reads `df`, the data frame defined above; vapply() pins the result type)
df$Index <- vapply(
  str_split(df$X, ","),
  function(x) sum(as.numeric(x) < 0.01),
  integer(1)
)
#Then filter: keep rows with at least one such value, not exactly one
df1 <- df[df$Index >= 1, ]
V1 V2 V3 V4 X Index
1 A B C D 0.02,0.0001,0.03,0.15 1
4 A B C D 0.001,0.05,0.3 1
Here is a way to get the values in the X column into separate rows, select only those original rows where there is at least one value less than 0.01, and combine the values back together.
library(dplyr)
# Expand X to one value per row, filter whole original rows, then collapse.
df %>%
# `row` remembers which original row each expanded value came from
mutate(row = row_number()) %>%
tidyr::separate_rows(X, sep = ",", convert = TRUE) %>%
group_by(row) %>%
# keep all values of an original row if any of them is below 0.01
filter(any(X < 0.01)) %>%
group_by(V1, V2, V3, V4, .add = TRUE) %>%
#Or a bit simpler
#group_by(across(V1:V4), .add = TRUE) %>%
# toString() joins the values with ", " into one string per original row
summarise(X = toString(X)) %>%
ungroup %>%
select(-row)
# V1 V2 V3 V4 X
# <chr> <chr> <chr> <chr> <chr>
#1 A B C D 0.02, 0.0001, 0.03, 0.15
#2 A B C D 0.001, 0.05, 0.3

Create a new column based on difference of dates

I have a dataframe in which I have to create a new column based on the difference of two dates. Example:
Col1 Col2 Col3 Date New_Column_Required
A X A 01/01/2001 Wave1
B Y Q 01/01/2001 Wave1
C Z N 01/01/2001 Wave1
D W M 02/01/2001 Wave2
E Q V 02/01/2001 Wave2
F R O 03/01/2001 Wave3
G S T 03/01/2001 Wave3
2nd date - 1st date should be wave 1, 3rd date - 2nd date Wave 2 and so on. The problem I'm facing is because of the multiple dates can't seem to figure out.
Using dplyr we could change Date to class Date, arrange them based on Date and subtract Date from the first value.
library(dplyr)
# Parse Date as day/month/year, sort by it, and label each row "Wave<k>"
# where k is the offset from the earliest date plus 1.
# NOTE(review): this numbers waves consecutively only if the distinct dates
# are consecutive days; the commented alternative below numbers the distinct
# dates instead - confirm which behaviour is wanted.
df %>%
mutate(Date = lubridate::dmy(Date)) %>%
arrange(Date) %>%
mutate(new_col = paste0("Wave", Date - first(Date) + 1))
#OR
#mutate(new_col = paste0("Wave", as.integer(as.factor(Date))))
# Col1 Col2 Col3 Date new_col
#1 A X A 2001-01-01 Wave1
#2 B Y Q 2001-01-01 Wave1
#3 C Z N 2001-01-01 Wave1
#4 D W M 2001-01-02 Wave2
#5 E Q V 2001-01-02 Wave2
#6 F R O 2001-01-03 Wave3
#7 G S T 2001-01-03 Wave3
And the same logic in base R :
# Base R version: parse the dd/mm/yyyy strings, sort the rows by date, and
# label each row with its offset from the first (earliest) date plus 1.
df[["Date"]] <- as.Date(df[["Date"]], format = "%d/%m/%Y")
df <- df[order(df[["Date"]]), ]
transform(df, new_col = paste0('Wave', Date - Date[1] + 1))
data
df <- structure(list(Col1 = c("A", "B", "C", "D", "E", "F", "G"), Col2 = c("X",
"Y", "Z", "W", "Q", "R", "S"), Col3 = c("A", "Q", "N", "M", "V",
"O", "T"), Date = c("01/01/2001", "01/01/2001", "01/01/2001",
"02/01/2001", "02/01/2001", "03/01/2001", "03/01/2001")), row.names = c(NA,
-7L), class = "data.frame")

Resources