Add two new columns based on another data frame - r

Having a dataframe like this:
structure(list(id = c("id1", "id1", "id2", "id2", "id3", "id3"
), title_num = c(1, 2, 1, 2, 1, 2), title_name = c("amazon1",
"yahoo2", "google1", NA, "yahoo1", "amazon2")), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
and another one like this:
dfcheck <- structure(list(status = c("open/close", "close", "open"), stock = c("company energy",
"goods and books", "other"), name = c("amazon1;google1", "google3;yahoo1",
"yahoo2;amazon2;google2")), class = "data.frame", row.names = c(NA,
-3L))
How is it possible to have an output like this:
id title_num title_name stock status
id1 1 amazon1 company energy open/close
id1 2 yahoo2 other open
id2 1 google1 company energy open/close
id2 2 <NA> <NA> <NA>
id3 1 yahoo1 goods and books close
id3 2 amazon2 other open

library(dplyr)
library(tidyr) # separate_rows() is exported by tidyr, not dplyr

# Rows to annotate: one row per (id, title_num); title_name may be NA.
df <-
  structure(list(id = c("id1", "id1", "id2", "id2", "id3", "id3"
  ), title_num = c(1, 2, 1, 2, 1, 2), title_name = c("amazon1",
  "yahoo2", "google1", NA, "yahoo1", "amazon2")), row.names = c(NA,
  -6L), class = c("tbl_df", "tbl", "data.frame"))

# Lookup table: each row carries a status/stock pair and a ";"-separated list
# of the names it applies to.
dfcheck <-
  structure(list(status = c("open/close", "close", "open"), stock = c("company energy",
  "goods and books", "other"), name = c("amazon1;google1", "google3;yahoo1",
  "yahoo2;amazon2;google2")), class = "data.frame", row.names = c(NA,
  -3L))

# Expand the lookup to one row per individual name, then left-join it onto df
# so every row of df is kept and unmatched names get NA status/stock.
df %>%
  left_join(
    dfcheck %>%
      separate_rows(name, sep = ";"),
    by = c("title_name" = "name")
  )
# A tibble: 6 x 5
id title_num title_name status stock
<chr> <dbl> <chr> <chr> <chr>
1 id1 1 amazon1 open/close company energy
2 id1 2 yahoo2 open other
3 id2 1 google1 open/close company energy
4 id2 2 NA NA NA
5 id3 1 yahoo1 close goods and books
6 id3 2 amazon2 open other

You can use left_join on a strsplit column of the second data set.
library(dplyr)
library(tidyr)

# Turn the ";"-separated `name` strings into a list column and unnest it, so
# the lookup has one row per individual name.
lookup_by_name <- dfcheck %>%
  mutate(name = strsplit(name, ";")) %>%
  unnest(name)

# Keep every row of df1 and attach the matching status/stock, if any.
left_join(df1, lookup_by_name, c("title_name" = "name"))
# A tibble: 6 × 5
id title_num title_name status stock
<chr> <dbl> <chr> <chr> <chr>
1 id1 1 amazon1 open/close company energy
2 id1 2 yahoo2 open other
3 id2 1 google1 open/close company energy
4 id2 2 NA NA NA
5 id3 1 yahoo1 close goods and books
6 id3 2 amazon2 open other
Data
df1 <- structure(list(id = c("id1", "id1", "id2", "id2", "id3", "id3"
), title_num = c(1, 2, 1, 2, 1, 2), title_name = c("amazon1",
"yahoo2", "google1", NA, "yahoo1", "amazon2")), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
dfcheck <- structure(list(status = c("open/close", "close", "open"), stock = c("company energy",
"goods and books", "other"), name = c("amazon1;google1", "google3;yahoo1",
"yahoo2;amazon2;google2")), class = "data.frame", row.names = c(NA,
-3L))

Related

How to combine values with the same column name into a new dataframe in R?

I have the following dataset
Original dataset:
ID Col1 Col1 Col1 Col2 Col2 Col2
A Dog House
B Dog Car Bike
C Cat House
D Mouse Bike
Is there any way to create a new dataframe that combines all values with the same column name like below
Expected dataset:
ID Col1 Col2
A Dog House
B Dog Car, Bike
C Cat House
D Mouse Bike
You could do something like this:
df <- structure(list(
ID = c("A", "B", "C", "D"),
Col1 = c("Dog", "Dog", NA, NA),
Col1 = c(NA, NA, "Cat", NA),
Col1 = c(NA, NA, NA, "Mouse"),
Col2 = c("House", NA, "House", NA),
Col2 = c(NA, "Car", NA, NA),
Col2 = c(NA, "Bike", NA, "Bike")
),
class = c("data.frame"), row.names = c(NA, -4L)
)
library(dplyr)
library(tidyr)
library(purrr)
# Distinct column names of df, excluding the key column.
vars_to_unite <- setdiff(unique(names(df)), "ID")

# Repair the duplicated column names so that each column can be selected.
renamed_df <- as_tibble(df, .name_repair = "unique")

# Collapse all columns whose repaired name starts with `prefix` into a single
# column named `prefix`, joining the non-NA values with ", ".
unite_by_prefix <- function(prefix) {
  same_name_cols <- select(renamed_df, starts_with(prefix))
  unite(same_name_cols, col = !!prefix, sep = ", ", na.rm = TRUE)
}

# One united column per original name, bound side by side, plus the key.
vars_to_unite %>%
  map_dfc(unite_by_prefix) %>%
  mutate(ID = df$ID)
#> # A tibble: 4 × 3
#> Col1 Col2 ID
#> <chr> <chr> <chr>
#> 1 Dog House A
#> 2 Dog Car, Bike B
#> 3 Cat House C
#> 4 Mouse Bike D
Created on 2022-06-01 by the reprex package (v2.0.1)
Base R solution:
# Input: a data.frame whose value columns share repeated names.
df <- structure(
  list(
    ID = c("A", "B", "C", "D"),
    Col1 = c("Dog", "Dog", NA, NA),
    Col1 = c(NA, NA, "Cat", NA),
    Col1 = c(NA, NA, NA, "Mouse"),
    Col2 = c("House", NA, "House", NA),
    Col2 = c(NA, "Car", NA, NA),
    Col2 = c(NA, "Bike", NA, "Bike")
  ),
  class = c("data.frame"),
  row.names = c(NA, -4L)
)

# Split: group the columns by their (repeated) name.
columns_by_name <- split.default(df, names(df))

# Apply: within each group, join the non-NA values of every row with ", ".
collapsed <- lapply(columns_by_name, function(same_name_cols) {
  apply(same_name_cols, 1, FUN = function(row_values) {
    toString(row_values[!is.na(row_values)])
  })
})

# Combine: bind the collapsed columns and restore the original column order.
collapsed_matrix <- do.call(cbind, collapsed)
res <- data.frame(
  collapsed_matrix[, unique(names(df))],
  stringsAsFactors = FALSE,
  row.names = row.names(df)
)

# Print the result to the console.
res

How do I specify pivot_wider for an entire dataframe?

I am able to pivot_wider for a specific column using the following:
new_df <- pivot_wider(old_df, names_from = col10, values_from = value_col, values_fn = list)
I would like to pivot_wider with every column in a dataframe (minus an id column). What is the best way to do this? Should I use a loop or is there a way that this function takes the whole dataframe?
To clarify, using the below sample dataframes, I am able to go from old_df to new_df using the pivot_wider function I listed above. I would like to now go from old_df2 to new_df2.
old_df <- structure(list(id = c("1", "1", "2"), col10 = c("yellow",
"green", "green"), value_col = c("1", "1", "1")), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"))
old_df2 <- structure(list(id = c("1", "1", "2"), col10 = c("yellow",
"green", "green"), col11 = c("dog",
"cat", "dog"), value_col = c("1", "1", "1")), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"))
new_df <- pivot_wider(old_df, names_from = col10, values_from = value_col, values_fn = list)
new_df2 <- structure(list(id = c("1", "2"), yellow = c("1", "NULL"), green = c("1", "1"), dog = c("1", "1"), cat = c("1", "NULL")), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"))
If you would like to have separate column names for each value between these two columns (or any number of columns) you first need to use pivot_longer to put all the column names into a single column and then use pivot_wider to spread them:
library(tidyr)
library(dplyr) # select(), group_by(), summarise() and across() come from dplyr

old_df2 %>%
  # Stack every column except id and value_col into a single `vals` column.
  pivot_longer(!c(id, value_col), names_to = "Cols", values_to = "vals") %>%
  # Spread the stacked values out again: one column per distinct value.
  pivot_wider(names_from = vals, values_from = value_col) %>%
  select(-Cols) %>%
  # Collapse the rows of each id into one, summing the (character) flags.
  group_by(id) %>%
  summarise(across(everything(), ~ sum(as.numeric(.x), na.rm = TRUE)))
# A tibble: 2 x 5
id yellow dog green cat
<chr> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 1 1
2 2 0 1 1 0
Update 1
As per your update, here is a data.table option:
library(data.table)

# Melt every `col<digits>` column of old_df2 into one `value` column, then cast
# to wide form with one column per distinct value, counting rows per id.
# old_df2 is the input that has both col10 and col11. as.data.table() works on
# a copy, so the caller's old_df2 is not converted in place.
dcast(
  melt(
    as.data.table(old_df2),
    id.vars = "id",
    measure.vars = patterns("^col\\d+")
  ),
  id ~ value,
  fun.aggregate = length,
  fill = NA
)
which gives
id cat dog green yellow
1: 1 1 1 1 1
2: 2 NA 1 1 NA
Are you looking for something like below?
# Number the rows within each id (q = 1, 2, ...) and spread each numbered row
# into its own set of columns. old_df2 is the input that contains col11, which
# the col11.* columns of the result come from.
reshape(
  transform(
    old_df2,
    q = ave(id, id, FUN = seq_along)
  ),
  direction = "wide",
  idvar = "id",
  timevar = "q"
)
The output is
id col10.1 col11.1 value_col.1 col10.2 col11.2 value_col.2
1 1 yellow dog 1 green cat 1
3 2 green dog 1 <NA> <NA> <NA>
You could combine those columns and unnest them followed by pivot_wider:
library(tidyr)
library(dplyr)

old_df2 <- structure(
  list(
    id = c("1", "1", "2"),
    col10 = c("yellow", "green", "green"),
    col11 = c("dog", "cat", "dog"),
    value_col = c("1", "1", "1")
  ),
  row.names = c(NA, -3L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Glue col10 and col11 together and split them again to get a list column with
# both values per row, then unnest it so each value has its own row.
stacked <- old_df2 %>%
  mutate(
    new_col = strsplit(paste(col10, col11, sep = "_"), "_"),
    .keep = "unused"
  ) %>%
  unnest(new_col)

# One column per distinct value, filled from value_col.
pivot_wider(stacked, names_from = new_col, values_from = value_col)
#> # A tibble: 2 x 5
#> id yellow dog green cat
#> <chr> <chr> <chr> <chr> <chr>
#> 1 1 1 1 1 1
#> 2 2 <NA> 1 1 <NA>
Created on 2021-08-25 by the reprex package (v2.0.1)

Combine two matrix and mark common

I have two matrices like this:
Vehicle1 Year type
Car1 20 A
Car2 21 A
Car8 20 A
Second one
Vehicle2 Year type
Car1 20 M
Car2 21 M
Car7 90 M
I just need to combine the matrices based on the first column (Vehicle) and mark the rows common to both as A/M, like this:
Vehicle Year type
Car1 20 A/M
Car2 21 A/M
Car7 90 M
Car8 20 A
I used the merge function for this, but it only prints the common rows.
You can join the two data frames and combine the type columns:
# Full join keeps vehicles that appear in only one input. The overlapping
# `type` columns come back as type.x / type.y.
joined <- dplyr::full_join(df1, df2, by = c("Vehicle1" = "Vehicle2", "Year"))
# Merge the two type columns into one, skipping the NA side. Plain function
# calls are used so that no package has to be attached just for `%>%`.
tidyr::unite(joined, type, type.x, type.y, sep = "/", na.rm = TRUE)
# Vehicle1 Year type
#1 Car1 20 A/M
#2 Car2 21 A/M
#3 Car8 20 A
#4 Car7 90 M
data
df1 <- structure(list(Vehicle1 = c("Car1", "Car2", "Car8"), Year = c(20L,
21L, 20L), type = c("A", "A", "A")), class = "data.frame", row.names = c(NA, -3L))
df2 <- structure(list(Vehicle2 = c("Car1", "Car2", "Car7"), Year = c(20L,
21L, 90L), type = c("M", "M", "M")), class = "data.frame", row.names = c(NA, -3L))
Another dplyr solution.
library(dplyr)

# Give df2 the same key column name as df1 so the two can be stacked.
df2_renamed <- rename(df2, Vehicle1 = Vehicle2)

# Stack both inputs, then collapse the types of each (Vehicle1, Year) pair
# into a single "/"-separated string.
bind_rows(df1, df2_renamed) %>%
  group_by(Vehicle1, Year) %>%
  summarise(type = paste(type, collapse = "/")) %>%
  ungroup()
# # A tibble: 4 x 3
# Vehicle1 Year type
# <chr> <int> <chr>
# 1 Car1 20 A/M
# 2 Car2 21 A/M
# 3 Car7 90 M
# 4 Car8 20 A
You can also do this easily in base R.
# Outer merge on the vehicle column so vehicles present in only one input are
# kept; the Year/type columns of each input get .x / .y suffixes.
rr <- merge(m1, m2, all = TRUE, by.x = "Vehicle1", by.y = "Vehicle2")
# Stack the (Year, type) pairs of both inputs into long form, drop the rows
# that hold the NA side of an unmatched vehicle, and restore m1's column names.
rr <- setNames(
  na.omit(reshape(rr,
    idvar = "Vehicle1", varying = list(c(2, 4), c(3, 5)),
    direction = "long"
  )),
  c("Vehicle1", "t", names(m1)[-1])
)
# Logical mask rather than which() + negative indexing: with no duplicates,
# rr[-integer(0), ] would select zero rows and discard the whole result.
dupes <- duplicated(rr$Vehicle1)
# Vehicles that occur twice were present in both inputs.
rr[rr$Vehicle1 %in% rr$Vehicle1[dupes], 4] <- "A/M"
# Keep the first row per vehicle and drop the helper column `t`.
res <- rr[!dupes, -2]
res
# Vehicle1 Year type
# Car1.1 Car1 20 A/M
# Car2.1 Car2 21 A/M
# Car8.1 Car8 20 A
# Car7.2 Car7 90 M
Data:
m1 <- structure(list(Vehicle1 = c("Car1", "Car2", "Car8"), Year = c(20L,
21L, 20L), type = c("A", "A", "A")), class = "data.frame", row.names = c(NA,
-3L))
m2 <- structure(list(Vehicle2 = c("Car1", "Car2", "Car7"), Year = c(20L,
21L, 90L), type = c("M", "M", "M")), class = "data.frame", row.names = c(NA,
-3L))
Here is a base R option using merge
# Outer merge on vehicle and year; the clashing `type` columns come back as
# type.x / type.y, with NA on the side where the vehicle is missing.
tmp <- merge(df1, df2, by.x = c("Vehicle1", "Year"), by.y = c("Vehicle2", "Year"), all = TRUE)
# Join the non-NA types of each row with "/". This replaces a scalar ifelse()
# that kept only the first non-NA value whenever a row contained an NA, which
# would drop values if more than two type columns were present.
dfout <- cbind(tmp[c("Vehicle1", "Year")],
  type = apply(
    tmp[grep("type", names(tmp))],
    1,
    function(types) paste(na.omit(types), collapse = "/")
  )
)
such that
> dfout
Vehicle1 Year type
1 Car1 20 A/M
2 Car2 21 A/M
3 Car7 90 M
4 Car8 20 A
Data
> dput(df1)
structure(list(Vehicle1 = c("Car1", "Car2", "Car8"), Year = c(20L,
21L, 20L), type = c("A", "A", "A")), class = "data.frame", row.names = c(NA,
-3L))
> dput(df2)
structure(list(Vehicle2 = c("Car1", "Car2", "Car7"), Year = c(20L,
21L, 90L), type = c("M", "M", "M")), class = "data.frame", row.names = c(NA,
-3L))

In R how can I filter a column which name is a number

this is my code
# Attempt from the question: keep rows where type_id is 1, the column named
# `2` is 1, and Name is 'jack'.
# NOTE(review): `2:1` is the integer sequence c(2, 1), not a reference to the
# column named `2`, so this condition does not test that column.
centros <- eventos %>%
filter(type_id==1 & 2:1 & Name == 'jack')
data sample
type_id Name 2
1 jack 1
2 Mary NA
4 Peter 1
Thanks in advance
We can use backquotes
library(dplyr)

# A column whose name is not a syntactic R name (here: `2`) must be wrapped in
# backquotes to be referenced; all three conditions have to hold.
centros <- filter(
  eventos,
  type_id == 1,
  `2` == 1,
  Name == "jack"
)
centros
# type_id Name 2
#1 1 jack 1
data
eventos <- structure(list(type_id = c(1L, 2L, 4L), Name = c("jack", "Mary",
"Peter"), `2` = c(1L, NA, 1L)), class = "data.frame", row.names = c(NA,
-3L))

R: rename columns in list based on other row value

I have imported data from matlab and have a large list (over 1000 list elements) from which I created the following sample dataset data with only two list elements.
data <- structure(list(TEST.DATA.1.1 = structure(list(ID = c(2, 2, 2), YEAR = c(1990, 1991, 1992), DATA.1 = c(10, 20, 30), DATA.NAME = structure(c(1L, 1L, 1L), class = "factor", .Label = "Test"), Remarks = c(1990, 1991, 1992)), .Names = c("ID", "YEAR", "DATA.1", "DATA.NAME", "Remarks"), row.names = c(NA, -3L), class = "data.frame"), TEST.DATA.2.1 = structure(list(ID = c(4, 4), YEAR = c(2000, 2001), DATA.1 = c(55, 60), DATA.2 = c(0, 2), DATA.3 = c(4, 6), DATA.NAME.structure..n1....Dim...c.1L..1L.. = structure(c(1L,1L), class = "factor", .Label = "n1"), DATA.NAME.structure..n2....Dim...c.1L..1L.. = structure(c(1L, 1L), class = "factor", .Label = "n2"), DATA.NAME.structure..n3....Dim...c.1L..1L.. = structure(c(1L,1L), class = "factor", .Label = "n3"), Remarks = c(2000,2001)), .Names = c("ID", "YEAR", "DATA.1", "DATA.2", "DATA.3", "DATA.NAME.structure..n1....Dim...c.1L..1L..", "DATA.NAME.structure..n2....Dim...c.1L..1L..", "DATA.NAME.structure..n3....Dim...c.1L..1L..", "Remarks"), row.names = c(NA, -2L), class = "data.frame")), .Names = c("TEST.DATA.1.1", "TEST.DATA.2.1"))
data
$TEST.DATA.1.1
ID YEAR DATA.1 DATA.NAME Remarks
1 2 1990 10 Test 1990
2 2 1991 20 Test 1991
3 2 1992 30 Test 1992
$TEST.DATA.2.1
ID YEAR DATA.1 DATA.2 DATA.3 DATA.NAME.structure..n1....Dim...c.1L..1L.. DATA.NAME.structure..n2....Dim...c.1L..1L.. DATA.NAME.structure..n3....Dim...c.1L..1L.. Remarks
1 4 2000 55 0 4 n1 n2 n3 2000
2 4 2001 60 2 6 n1 n2 n3 2001
I am looking for a way to rename the data columns using the names stored in the DATA.NAME column(s). Sometimes there are multiple data columns with their respective names, as in the second list element, and sometimes there is only one, as in the first element. I need to do the renaming for a large list (> 1000 list elements) and then drop the DATA.NAME columns, as shown in data_new.
data_new
$TEST.DATA.1.1
ID YEAR Test Remarks
1 2 1990 10 1990
2 2 1991 20 1991
3 2 1992 30 1992
$TEST.DATA.2.1
ID YEAR n1 n2 n3 Remarks
1 4 2000 55 0 4 2000
2 4 2001 60 2 6 2001
Here's a base R approach:
# Rename the DATA.<n> columns of one data.frame using the values stored in the
# first row of its DATA.NAME* columns, then drop the DATA.NAME* columns.
rename_from_name_columns <- function(d) {
  col_names <- names(d)
  name_idx <- grep("^DATA\\.NAME", col_names)
  value_idx <- grep("^DATA\\.\\d+", col_names)
  names(d)[value_idx] <- as.character(unlist(d[1, name_idx]))
  d[name_idx] <- list(NULL)
  d
}

# Apply the renaming to every list element; element names are preserved.
data <- lapply(data, rename_from_name_columns)
data
## $TEST.DATA.1.1
## ID YEAR Test Remarks
## 1 2 1990 10 1990
## 2 2 1991 20 1991
## 3 2 1992 30 1992
##
## $TEST.DATA.2.1
## ID YEAR n1 n2 n3 Remarks
## 1 4 2000 55 0 4 2000
## 2 4 2001 60 2 6 2001
Solution using data.table package.
require(data.table)
data <- structure(list(TEST.DATA.1.1 = structure(list(ID = c(2, 2, 2), YEAR = c(1990, 1991, 1992), DATA.1 = c(10, 20, 30), DATA.NAME = structure(c(1L, 1L, 1L), class = "factor", .Label = "Test"), Remarks = c(1990, 1991, 1992)), .Names = c("ID", "YEAR", "DATA.1", "DATA.NAME", "Remarks"), row.names = c(NA, -3L), class = "data.frame"), TEST.DATA.2.1 = structure(list(ID = c(4, 4), YEAR = c(2000, 2001), DATA.1 = c(55, 60), DATA.2 = c(0, 2), DATA.3 = c(4, 6), DATA.NAME.structure..n1....Dim...c.1L..1L.. = structure(c(1L,1L), class = "factor", .Label = "n1"), DATA.NAME.structure..n2....Dim...c.1L..1L.. = structure(c(1L, 1L), class = "factor", .Label = "n2"), DATA.NAME.structure..n3....Dim...c.1L..1L.. = structure(c(1L,1L), class = "factor", .Label = "n3"), Remarks = c(2000,2001)), .Names = c("ID", "YEAR", "DATA.1", "DATA.2", "DATA.3", "DATA.NAME.structure..n1....Dim...c.1L..1L..", "DATA.NAME.structure..n2....Dim...c.1L..1L..", "DATA.NAME.structure..n3....Dim...c.1L..1L..", "Remarks"), row.names = c(NA, -2L), class = "data.frame")), .Names = c("TEST.DATA.1.1", "TEST.DATA.2.1"))
# Rename the DATA.<n> columns of `x` using the values stored in the first row
# of its DATA.NAME* columns, then drop the DATA.NAME* columns.
#
# x: a data.frame (converted to a data.table internally).
# Returns a data.table with the renamed columns and without DATA.NAME*.
fun <- function(x) {
  x <- data.table(x)
  # Anchor the patterns and escape the dot so only real DATA.<n> / DATA.NAME*
  # columns match.
  value_cols <- grep("^DATA\\.[0-9]", names(x), value = TRUE)
  name_cols <- grep("^DATA\\.NAME", names(x), value = TRUE)
  new_names <- as.character(unlist(x[1, name_cols, with = FALSE]))
  setnames(x, value_cols, new_names)
  # `with = FALSE` together with `:=` is deprecated in data.table; wrap the
  # column-name vector in parentheses instead.
  if (length(name_cols) > 0L) {
    x[, (name_cols) := NULL]
  }
  x
}
data_new <- lapply(data, fun)
This should work...
library(dplyr)
# seq_along() instead of 1:length(): the loop body is skipped for an empty
# list instead of running with i = 1 and i = 0.
for (i in seq_along(data)) {
  d <- data[[i]]
  # Find the new names: the first-row values of the DATA.NAME* columns
  name_cols <- select(d, starts_with("DATA.NAME"))
  new_names <- as.character(unlist(name_cols[1, ], use.names = FALSE))
  # Remove the columns containing the names
  d <- select(d, -starts_with("DATA.NAME"))
  # Pick which columns we want to replace; the dot is escaped and the pattern
  # anchored so that only DATA.<digits> columns match
  to_replace <- grep("^DATA\\.[0-9]+", names(d))
  # Replace those names
  names(d)[to_replace] <- new_names
  # Replace the list element
  data[[i]] <- d
}

Resources