I'm still learning R and was wondering if there is an elegant way of manipulating the below df to achieve df2.
I'm not sure if it's a loop that is supposed to be used for this, but basically I want to extract the first Non NA "X_No" Value if the "X_No" value is NA in the first row. This would perhaps be best described through an example from df to the desired df2.
# Example input: five ID/No column pairs (prefixes A-E), four rows each.
A_ID <- c("A", "B", "I", "N")
A_No <- c(11, NA, 15, NA)
B_ID <- c("B", "C", "D", "J")
B_No <- c(NA, NA, 12, NA)
C_ID <- c("E", "F", "G", "P")
C_No <- c(NA, 13, 14, 20)
D_ID <- c("J", "K", "L", "M")
D_No <- c(NA, NA, NA, 40)
E_ID <- c("W", "X", "Y", "Z")
E_No <- c(50, 32, 48, 40)
df <- data.frame(
  A_ID, A_No, B_ID, B_No, C_ID, C_No, D_ID, D_No, E_ID, E_No
)

# Desired output: per pair, the ID and No taken from the first row whose
# No is not NA.
ID <- c("A", "D", "F", "M", "W")
No <- c(11, 12, 13, 40, 50)
df2 <- data.frame(ID, No)
I'm hoping for an elegant solution to this, as there are over 1000 columns similar to the example provided.
I've looked all over the web for a similar example however to no avail that would reproduce the expected result.
Your help is very much appreciated.
Thank you
I don't know if I'd call it "elegant", but here is a potential solution:
library(tidyverse)

# Example input: five ID/No column pairs (prefixes A-E), four rows each.
A_ID <- c("A", "B", "I", "N")
A_No <- c(11, NA, 15, NA)
B_ID <- c("B", "C", "D", "J")
B_No <- c(NA, NA, 12, NA)
C_ID <- c("E", "F", "G", "P")
C_No <- c(NA, 13, 14, 20)
D_ID <- c("J", "K", "L", "M")
D_No <- c(NA, NA, NA, 40)
E_ID <- c("W", "X", "Y", "Z")
E_No <- c(50, 32, 48, 40)
df <- data.frame(
  A_ID, A_No, B_ID, B_No, C_ID, C_No, D_ID, D_No, E_ID, E_No
)

# Expected result, used below for comparison.
ID <- c("A", "D", "F", "M", "W")
No <- c(11, 12, 13, 40, 50)
df2 <- data.frame(ID, No)

# Reshape to one row per (prefix, original row) with columns Col, ID, No;
# drop incomplete rows; keep the first remaining row of every prefix.
output <- df %>%
  pivot_longer(
    cols = everything(),
    names_to = c("Col", ".value"),
    names_sep = "_"
  ) %>%
  drop_na() %>%
  group_by(Col) %>%
  slice_head(n = 1) %>%
  ungroup() %>%
  select(-Col)
# Expected result, for reference.
df2
#>   ID No
#> 1  A 11
#> 2  D 12
#> 3  F 13
#> 4  M 40
#> 5  W 50

# Result of the pipeline.
output
#> # A tibble: 5 × 2
#>   ID       No
#>   <chr> <dbl>
#> 1 A        11
#> 2 D        12
#> 3 F        13
#> 4 M        40
#> 5 W        50

# dplyr::all_equal() is deprecated; use base all.equal() instead. The tibble
# is converted to a plain data frame first so that only the contents (not the
# tibble class) are compared.
all.equal(df2, as.data.frame(output))
#> [1] TRUE
Created on 2023-02-08 with reprex v2.0.2
Using base R with max.col (assuming the columns are alternating with ID, No)
# Transpose each half once so every ID/No pair is a row and every original
# row is a column. Relies on the columns strictly alternating ID, No, ID, No.
ids <- t(df[c(TRUE, FALSE)])
nos <- t(df[c(FALSE, TRUE)])
# For each pair, the column position of the first non-NA No.
ind <- max.col(!is.na(nos), ties.method = "first")
# Two-column (row, column) index matrix selecting one cell per pair.
m1 <- cbind(seq_along(ind), ind)
data.frame(ID = ids[m1], No = nos[m1])
ID No
1 A 11
2 D 12
3 F 13
4 M 40
5 W 50
Here is a data.table solution that should scale well to a (very) large dataset.
functionally
Split the data.frame into a list of chunks of columns, based on their
names: all columns starting with A_ go to
the first element, all columns starting with B_ go to the second, and so on.
Then stack these list elements on top of each other using
data.table::rbindlist, ignoring the column names (this only works if
every prefix group, from A_ through n_, has the same number of
columns).
Now get the first non-NA value of each value in the first column
code
library(data.table)
# Split the columns into chunks by the prefix that comes BEFORE the
# underscore (A, B, C, ...); each chunk holds one ID/No pair.
L <- split.default(df, f = sub("_.*$", "", names(df)))
# Stack the chunks. Column names are taken from the first chunk (A_ID, A_No);
# `idcol` records which prefix every row came from.
DT <- rbindlist(L, use.names = FALSE, idcol = "grp")
# Per prefix, keep the ID/No of the first row whose No is not NA.
# (Grouping by ID instead would return one row per distinct ID, not one per
# column pair.)
DT[!is.na(A_No), .(ID = A_ID[1], No = A_No[1]), by = grp][, .(ID, No)]
#    ID No
# 1:  A 11
# 2:  D 12
# 3:  F 13
# 4:  M 40
# 5:  W 50
I have a data frame of factors that I need to convert to numeric/dummy values. I applied as.integer to each column and then used cbind to attach the results to the original data frame. Is there a way to do all columns at once?
# Example data: three factor columns.
data <- data.frame(
  x = c("a", "b", "c"),
  y = c("d", "e", "f"),
  z = c("g", "h", "i"),
  stringsAsFactors = TRUE
)
# Integer codes of each factor column.
x_factor <- as.integer(data$x)
y_factor <- as.integer(data$y)
z_factor <- as.integer(data$z)
# Attach the codes to the original data frame. The first argument must be
# `data`; `a` is not defined anywhere in this snippet.
data_binded <- cbind(data, x_factor, y_factor, z_factor)
Here is dplyr solution:
library(dplyr)
# Add a numeric-coded copy of every factor column, named "<column>_factor".
# Selecting with ends_with("factor") matches nothing in `data` (its columns
# are x, y, z), so the new columns have to be created through `.names`.
data %>%
  mutate(across(where(is.factor), as.numeric, .names = "{.col}_factor"))
x y z x_factor y_factor z_factor
1 a d g 1 1 1
2 b e h 2 2 2
3 c f i 3 3 3
I'm trying to iterate over the rows of a data frame and get access to the values in the columns of each row. Perhaps I need a paradigm shift. I've attempted a vectorization approach. My ultimate objective is to use specific column values in each row to filter another data frame.
Any help would be appreciated.
# Example data: three rows with columns a (1:3), b ("x", "y", "z") and c (7:9).
df <- data.frame(a = 1:3, b = letters[24:26], c = 7:9)
# Takes a single row (any object supporting `$a`, `$b`, `$c`) and copies the
# three fields into locals. Because the last expression is an assignment, the
# value of row$c is what the function returns (invisibly).
f <- function(row) {
var1 <- row$a
var2 <- row$b
var3 <- row$c
}
# NOTE(review): purrr is not attached in this snippet, and pmap() presumably
# hands each row to `f` as separate arguments named a, b, c rather than as one
# `row` object, so this call is the part that does not work as written.
pmap(df, f)
Is there a way to do this in purrr?
Using pmap, we can do
library(purrr)
# Gather the values of each row into a list and pass that list to f() as
# its single `row` argument.
pmap(df, function(...) f(list(...)))
#[[1]]
#[1] 7
#[[2]]
#[1] 8
#[[3]]
#[1] 9
Or use rowwise with cur_data
library(dplyr)
# Process one row at a time: inside rowwise(), pick(everything()) is the
# current one-row data frame. It replaces cur_data(), which is deprecated.
df %>%
  rowwise() %>%
  transmute(new = f(pick(everything())))
-output
# A tibble: 3 x 1
# Rowwise:
# new
# <int>
#1 7
#2 8
#3 9
library(tidyverse)

# Example data: three rows with columns a, b and c.
df <- data.frame(a = 1:3, b = letters[24:26], c = 7:9)

# Takes a one-row data frame and reads its a, b and c fields; the value of
# the last assignment (row$c) is returned invisibly.
f <- function(row) {
  var1 <- row$a
  var2 <- row$b
  var3 <- row$c
}

# Split into one-row data frames and apply f() to each. Splitting on integer
# row positions keeps the original row order; splitting on the character
# row names sorts them as text ("1", "10", "11", "2", ...) once there are
# ten or more rows.
df %>%
  split(seq_len(nrow(.))) %>%
  map(f)
I have data as follows:
# One-column example frame mixing digit strings and letters.
DT <- data.frame(charnum = c("1", "2", "3", "A", "B"))
What I want is quite simple, but I could not find an example on it on stackoverflow.
I want to split the dataset into two. DT1 with all the rows for which DT$charnum has numbers and DT2 with all the rows for which DT$charnum has letters. I tried something like:
# NOTE: as.numeric() always returns a numeric vector, so is.numeric() here is
# a single TRUE for the whole column rather than one value per row; the
# subset therefore keeps every row instead of only the numeric ones.
DT1 <- DT[is.numeric(as.numeric(DT$charnum)),]
But that gives:
[1] 1 2 3 A B
Levels: 1 2 3 A B
Desired result:
> DT1
charnum
1 1
2 2
3 3
> DT2
charnum
1 A
2 B
You can use regular expressions to separate the two types of data that you have and then separate the two datasets.
# split() names its groups after the grepl() result:
# "TRUE" = digits only, "FALSE" = everything else.
result <- split(DT, grepl("^\\d+$", DT$charnum))
# Index by name, not position: position 1 is the "FALSE" (letters) group,
# which would put the letters in DT1 instead of the numbers.
# as.is = TRUE keeps strings as character and avoids the warning that
# type.convert() gives when the argument is left unspecified.
DT1 <- type.convert(result[["TRUE"]], as.is = TRUE)
DT1
#  charnum
#1       1
#2       2
#3       3
DT2 <- type.convert(result[["FALSE"]], as.is = TRUE)
# Renumber the rows so they start at 1, as in the desired result.
rownames(DT2) <- NULL
DT2
#  charnum
#1       A
#2       B
Using tidyverse
library(dplyr)
library(purrr)
library(stringr)
# Split into a list of two data frames: rows that are not purely digits
# first (grp = FALSE), then rows consisting only of digits (grp = TRUE).
# The pattern is anchored so that mixed values such as "A1" are not treated
# as numbers, matching the base R answer.
DT %>%
  group_split(grp = str_detect(charnum, "^\\d+$"), .keep = FALSE) %>%
  map(type.convert, as.is = TRUE)
I have a data set like this
x y z
a 5 4
b 1 2
And I want to concatenate the row labels with the column names:
ay 5
az 4
by 1
bz 2
Thanks
You can use melt and paste, but you will need to make your rownames a variable, i.e.
# Keep the row labels as an ordinary column so melt() carries them along.
df$new <- rownames(df)
m_df <- reshape2::melt(df)
# Label every value as <row label><column name>.
rownames(m_df) <- paste0(m_df$new, m_df$variable)
# Drop the first two columns so that only the values remain.
m_df <- m_df[-(1:2)]
m_df
# value
#ax 5
#bx 1
#ay 4
#by 2
#az 3
#bz 1
After your edit, you don't need to convert rownames to a variable so just,
# Long format: one row per (x, variable) combination.
m1_df <- reshape2::melt(df)
# Concatenate the x label with the former column name.
m1_df[["new"]] <- with(m1_df, paste0(x, variable))
m1_df
# x variable value new
#1 a y 5 ay
#2 b y 1 by
#3 a z 4 az
#4 b z 2 bz
You can then tidy your data frame to required output
with dplyr-tidyr
library(dplyr)
library(tidyr)
# Reshape every column except x into (var, val) pairs, prefix each former
# column name with its x label, and sort by the combined label.
# pivot_longer() replaces the superseded gather(); as.data.frame() keeps the
# result a plain data frame, as gather() returned for a data frame input.
df %>%
  pivot_longer(-x, names_to = "var", values_to = "val") %>%
  mutate(var = paste0(x, var)) %>%
  select(var, val) %>%
  arrange(var) %>%
  as.data.frame()
# var val
#1 ay 5
#2 az 4
#3 by 1
#4 bz 2
library(reshape2)
library(dplyr)
library(tibble)
library(stringr)

# Example frame; the row names a/b are the labels to concatenate.
x <- data.frame(
  x = c(5, 1),
  y = c(4, 2),
  z = c(3, 1),
  row.names = c("a", "b")
)

# Promote the row names to a column, then melt to long format with
# "rownames" as the id column.
x <- x %>%
  rownames_to_column("rownames") %>%
  melt("rownames")

# Row names become <row label><column name>, e.g. "ax".
row.names(x) <- str_c(x$rownames, x$variable)

# Keep only the values and strip the column header.
x <- select(x, value)
names(x) <- NULL
> x
ax 5
bx 1
ay 4
by 2
az 3
bz 1
Here is another option in base R
# Flatten every column except the first into a list of single values.
values <- as.list(unlist(df1[-1]))
# Build the matching labels: each x value pasted to each remaining column name.
labels <- outer(df1$x, names(df1)[-1], paste0)
# stack() returns (values, ind); [2:1] puts the label column first.
stack(setNames(values, labels))[2:1]