subsetting a dataframe by columnames which are dates - r

I would like to subset a data.frame based on the dates in the rownames. My dates are of this format:
192707
192708
192709
df$Date <- as.yearmon(as.character(df$Date), "%Y%m")
edit: I set the rownames equal to the Date variable like this (and would like to delete Date afterwards):
rownames(df)<-df$Date
I thought of subsetting like this:
train_dates <- seq(as.yearmon(as.character("1959-12-31"), "%Y%m"), as.yearmon(as.character("1984-12-31"), "%Y%m", "months"))
df <- subset(df, rownames(df) %in% train_dates)
or
df[train_dates,]
But I am having difficulties creating the correct sequence.

Try using format
# Build the "YYYYMM" lookup keys for every month of 1959.
# The sequence is anchored on the first of each month: stepping by "month"
# from the 31st overflows in shorter months (e.g. "1959-02-31" is counted
# forward into March), which would silently drop February, April, June,
# September and November from the lookup.
train_dates <- format(
  seq(as.Date("1959-01-01"), as.Date("1959-12-01"), by = "month"),
  "%Y%m"
)
and then, using library(data.table)
df <- as.data.table(df)
train_df <- df[Date %in% train_dates]

One solution could be using rownames_to_column from tibble package.
#data
df <- data.frame(A = 1:5, B = letters[1:5])
rownames(df) <- c("195901", "196008", "196109", "201812", "196112")
# A B
# 195901 1 a
# 196008 2 b
# 196109 3 c
# 201812 4 d # not in train_dates
# 196112 5 e
library(zoo)
#create sequence from 1959 to 1968. Lookup table
train_dates <- format(as.yearmon(1959 + seq(0, 119)/12), format="%Y%m")
Option #1:
library(tidyverse)
df %>%
rownames_to_column("datemon") %>%
filter(datemon %in% train_dates) %>%
column_to_rownames("datemon")
# A B
# 195901 1 a
# 196008 2 b
# 196109 3 c
# 196112 5 e
Option #2
df[rownames(df) %in% train_dates, ]
# A B
# 195901 1 a
# 196008 2 b
# 196109 3 c
# 196112 5 e

Related

Extract first Non NA value over multiple columns

I'm still learning R and was wondering if there was an elegant way of manipulating the below df to achieve df2.
I'm not sure if it's a loop that is supposed to be used for this, but basically I want to extract the first Non NA "X_No" Value if the "X_No" value is NA in the first row. This would perhaps be best described through an example from df to the desired df2.
A_ID <- c('A','B','I','N')
A_No <- c(11,NA,15,NA)
B_ID <- c('B','C','D','J')
B_No <- c(NA,NA,12,NA)
C_ID <- c('E','F','G','P')
C_No <- c(NA,13,14,20)
D_ID <- c('J','K','L','M')
D_No <- c(NA,NA,NA,40)
E_ID <- c('W','X','Y','Z')
E_No <- c(50,32,48,40)
df <- data.frame(A_ID,A_No,B_ID,B_No,C_ID,C_No,D_ID,D_No,E_ID,E_No)
ID <- c('A','D','F','M','W')
No <- c(11,12,13,40,50)
df2 <- data.frame(ID,No)
I'm hoping for an elegant solution to this as there are over a 1000 columns similar to the example provided.
I've looked all over the web for a similar example however to no avail that would reproduce the expected result.
Your help is very much appreciated.
Thankyou
I don't know if I'd call it "elegant", but here is a potential solution:
library(tidyverse)
A_ID <- c('A','B','I','N')
A_No <- c(11,NA,15,NA)
B_ID <- c('B','C','D','J')
B_No <- c(NA,NA,12,NA)
C_ID <- c('E','F','G','P')
C_No <- c(NA,13,14,20)
D_ID <- c('J','K','L','M')
D_No <- c(NA,NA,NA,40)
E_ID <- c('W','X','Y','Z')
E_No <- c(50,32,48,40)
df <- data.frame(A_ID,A_No,B_ID,B_No,C_ID,C_No,D_ID,D_No,E_ID,E_No)
ID <- c('A','D','F','M','W')
No <- c(11,12,13,40,50)
df2 <- data.frame(ID,No)
output <- df %>%
pivot_longer(everything(),
names_sep = "_",
names_to = c("Col", ".value")) %>%
drop_na() %>%
group_by(Col) %>%
slice_head(n = 1) %>%
ungroup() %>%
select(-Col)
df2
#> ID No
#> 1 A 11
#> 2 D 12
#> 3 F 13
#> 4 M 40
#> 5 W 50
output
#> # A tibble: 5 × 2
#> ID No
#> <chr> <dbl>
#> 1 A 11
#> 2 D 12
#> 3 F 13
#> 4 M 40
#> 5 W 50
all_equal(df2, output)
#> [1] TRUE
Created on 2023-02-08 with reprex v2.0.2
Using base R with max.col (assuming the columns are alternating with ID, No)
ind <- max.col(!is.na(t(df[c(FALSE, TRUE)])), "first")
m1 <- cbind(seq_along(ind), ind)
data.frame(ID = t(df[c(TRUE, FALSE)])[m1], No = t(df[c(FALSE, TRUE)])[m1])
ID No
1 A 11
2 D 12
3 F 13
4 M 40
5 W 50
Here is a data.table solution that should scale well to a (very) large dataset.
functionally
split the data.frame into a list of chunks of columns, based on their
names. So all columns starting with A_ go to
the first element, all columns starting with B_ to the second, and so on.
Then, put these list elements on top of each other, using
data.table::rbindlist. Ignore the column names (this only works if
every group, from A_ and B_ through to n_, has the same number of
columns).
Now get the first non-NA value for each value in the first column
code
library(data.table)
# split the columns into groups keyed by what comes BEFORE the underscore
# (the regex keeps the prefix, e.g. "A" for A_ID and A_No)
L <- split.default(df, f = gsub("(.*)_.*", "\\1", names(df)))
# stack the groups on top of each other; use.names = FALSE binds by position,
# which is why the stacked columns are referred to as A_ID / A_No below
DT <- rbindlist(L, use.names = FALSE)
# drop rows whose number is NA, then take the first remaining number per ID
# (keyby also sorts the result by ID)
DT[!is.na(A_No), .(No = A_No[1]), keyby = .(ID = A_ID)]
# ID No
# 1: A 11
# 2: D 12
# 3: F 13
# 4: G 14
# 5: I 15
# 6: M 40
# 7: P 20
# 8: W 50
# 9: X 32
#10: Y 48
#11: Z 40

Create a numerical df in r using factor df

I have a data frame of factors that I need to convert to numerical/dummy codes. I applied as.integer to each column and then used cbind to attach the results to the original data frame. Is there a way to do all columns at once?
# Example data: three factor columns.
data <- data.frame(
  x = c('a','b','c'),
  y = c('d','e','f'),
  z = c('g','h','i'),
  stringsAsFactors = TRUE
)
# Integer codes of each factor, computed one column at a time.
x_factor <- as.integer(data$x)
y_factor <- as.integer(data$y)
z_factor <- as.integer(data$z)
# Attach the codes to the original data frame (the original referenced an
# undefined object `a` here instead of `data`).
data_binded <- cbind(data, x_factor, y_factor, z_factor)
Here is dplyr solution:
library(dplyr)
# Add an integer-coded copy of every factor column in one step. `.names`
# gives the copies the "_factor" suffix so the original columns are kept.
# (Selecting with ends_with("factor") matches nothing in `data`, because the
# *_factor columns do not exist yet.)
data %>%
  mutate(across(where(is.factor), as.integer, .names = "{.col}_factor"))
x y z x_factor y_factor z_factor
1 a d g 1 1 1
2 b e h 2 2 2
3 c f i 3 3 3

In R, iterate over a data.frame by row and get access to the column values

I'm trying to iterate over the rows of a data frame and get access to the values in the columns of each row. Perhaps I need a paradigm shift. I've attempted a vectorization approach. My ultimate objective is to use specific column values in each row to filter another data frame.
Any help would be appreciated.
df <- data.frame(a = 1:3, b = letters[24:26], c = 7:9)
# Read the a, b and c fields of a single row.
# `row` must support `$` access to elements named a, b and c (e.g. a list or
# a one-row data frame).
# The last expression is an assignment, so the function returns the value of
# row$c invisibly.
f <- function(row) {
var1 <- row$a
var2 <- row$b
var3 <- row$c
}
pmap(df, f)
Is there a way to do this in purrr?
Using pmap, we can do
library(purrr)
pmap(df, ~ f(list(...)))
#[[1]]
#[1] 7
#[[2]]
#[1] 8
#[[3]]
#[1] 9
Or use rowwise with cur_data
library(dplyr)
# Call f() once per row: rowwise makes each row its own group, and
# cur_data() passes the current group's data to f().
# NOTE(review): cur_data() is deprecated in recent dplyr releases in favour
# of pick(); confirm against the installed dplyr version.
df %>%
rowwise %>%
transmute(new = f(cur_data()))
-output
# A tibble: 3 x 1
# Rowwise:
# new
# <int>
#1 7
#2 8
#3 9
library(tidyverse)
df <- data.frame(a = 1:3, b = letters[24:26], c = 7:9)
f <- function(row) {
var1 <- row$a
var2 <- row$b
var3 <- row$c
}
df %>%
split(rownames(.)) %>%
map(
~f(.x)
)

Subsetting data, if the column entry contains letters

I have data as follows:
DT <- as.data.frame(c("1","2", "3", "A", "B"))
names(DT)[1] <- "charnum"
What I want is quite simple, but I could not find an example on it on stackoverflow.
I want to split the dataset into two. DT1 with all the rows for which DT$charnum has numbers and DT2 with all the rows for which DT$charnum has letters. I tried something like:
DT1 <- DT[is.numeric(as.numeric(DT$charnum)),]
But that gives:
[1] 1 2 3 A B
Levels: 1 2 3 A B
Desired result:
> DT1
charnum
1 1
2 2
3 3
> DT2
charnum
1 A
2 B
You can use regular expressions to separate the two types of data that you have and then separate the two datasets.
# Split the rows by whether charnum consists only of digits. split() on a
# logical vector returns a list with elements named "FALSE" and "TRUE".
result <- split(DT, grepl('^\\d+$', DT$charnum))
# Select by name rather than position: position 1 is the FALSE (letters)
# group, but the desired DT1 holds the numeric rows.
# as.is = TRUE keeps non-numeric columns as character instead of factor.
DT1 <- type.convert(result[["TRUE"]], as.is = TRUE)
DT1
# charnum
#1 1
#2 2
#3 3
DT2 <- type.convert(result[["FALSE"]], as.is = TRUE)
DT2
# charnum
#4 A
#5 B
Using tidyverse
library(dplyr)
library(purrr)
library(stringr)
DT %>%
group_split(grp = str_detect(charnum, "\\d+"), .keep = FALSE) %>%
map(type.convert, as.is = TRUE)

Concatenate rows and columns

I have a data set like this
x y z
a 5 4
b 1 2
And I want to concatenate the rows and columns:
ay 5
az 4
by 1
bz 2
Thanks
You can use melt and paste, but you will need to make your rownames a variable, i.e.
df$new <- rownames(df)
m_df <- reshape2::melt(df)
rownames(m_df) <- paste0(m_df$new, m_df$variable)
m_df <- m_df[-c(1:2)]
m_df
# value
#ax 5
#bx 1
#ay 4
#by 2
#az 3
#bz 1
After your edit, you don't need to convert rownames to a variable so just,
m1_df <- reshape2::melt(df)
m1_df$new <- paste0(m1_df$x, m1_df$variable)
m1_df
# x variable value new
#1 a y 5 ay
#2 b y 1 by
#3 a z 4 az
#4 b z 2 bz
You can then tidy your data frame to required output
with dplyr-tidyr
library(dplyr)
library(tidyr)
# Reshape every column except x to long form (var = column name, val = value),
# prefix each column name with the row's x value, keep only the combined key
# and the value, and sort by the key.
# NOTE: gather() is superseded in tidyr; the current equivalent is
# pivot_longer(-x, names_to = "var", values_to = "val").
df %>%
gather(var, val, -x) %>%
mutate(var=paste0(x, var)) %>%
select(var, val)%>%
arrange(var)
# var val
#1 ay 5
#2 az 4
#3 by 1
#4 bz 2
library(reshape2)
library(dplyr)
library(tibble)
library(stringr)
# Create dataframe
x <- data.frame(x = c(5, 1),
y = c(4, 2),
z = c(3, 1),
row.names = c('a', 'b'))
# Convert rowname to column and melt
x <- tibble::rownames_to_column(x, "rownames") %>%
melt('rownames')
# assign concat columns as rownames
row.names(x) <- str_c(x$rownames, x$variable)
# Select relevant columns only
x <- select(x, value)
# Remove names from dataframe
names(x) <- NULL
> x
ax 5
bx 1
ay 4
by 2
az 3
bz 1
Here is another option in base R
stack(setNames(as.list(unlist(df1[-1])), outer(df1$x, names(df1)[-1], paste0)))[2:1]

Resources