Extracting elements from a list to create a matrix - r

I have a list of countries with lists inside each one of them.
Just to give you an example of a list object for one country with lists for two countries (df_DOTS):
df_DOTS <- list(BR = structure(list(`#FREQ` = "M", `#REF_AREA` = "AU", `#INDICATOR` = "TXG_FOB_USD",
`#COUNTERPART_AREA` = "BR", `#UNIT_MULT` = "6", `#TIME_FORMAT` = "P1M",
Obs = list(structure(list(`#TIME_PERIOD` = c("2019-07", "2019-08",
"2019-09"), `#OBS_VALUE` = c("55.687747", "36.076581", "57.764474"
)), class = "data.frame", row.names = c(NA, 3L)))), row.names = 2L, class = "data.frame"),
US = structure(list(`#FREQ` = "M", `#REF_AREA` = "AU", `#INDICATOR` = "TXG_FOB_USD",
`#COUNTERPART_AREA` = "US", `#UNIT_MULT` = "6", `#TIME_FORMAT` = "P1M",
Obs = list(structure(list(`#TIME_PERIOD` = c("2019-07",
"2019-08", "2019-09"), `#OBS_VALUE` = c("876.025841",
"872.02118", "787.272851")), class = "data.frame", row.names = c(NA,
3L)))), row.names = 1L, class = "data.frame"))
I can reach the matrix (matrix_DOTS) I am looking for using these lines of code:
library(dplyr)
library(rlist)
library(magrittr)
BR <- df_DOTS[["BR"]][["Obs"]] %>%
list.select(.$`#OBS_VALUE`) %>%
unlist() %>%
sapply(function(x) as.numeric(as.character(x))) %>%
mean()
US <- df_DOTS[["US"]][["Obs"]] %>%
list.select(.$`#OBS_VALUE`) %>%
unlist() %>%
sapply(function(x) as.numeric(as.character(x))) %>%
mean()
matrix_DOTS <- matrix(c(BR, US), nrow = 1, dimnames = list(c("AU"), c("BR", "US")))
Since I have a list of several countries with lists of other several countries inside them, I am looking for a more practical way of achieving matrix_DOTS. Any help is highly appreciated!
PS: This is the dput for the final matrix in this example:
matrix_DOTS <- structure(c(49.842934, 845.106624), .Dim = 1:2, .Dimnames = list(
"AU", c("BR", "US")))
EDIT
This is the procedure to obtain df_DOTS:
library(IMFData)
databaseID <- "DOT"
startdate = "2019-07-01"
enddate = "2019-09-01"
checkquery = FALSE
queryfilter <- list(CL_FREQ = "M", CL_AREA_DOT = "AU",
CL_INDICATOR_DOT = "TXG_FOB_USD",
CL_COUNTERPART_AREA_DOT = c("BR", "US"))
df_DOTS <- CompactDataMethod(databaseID, queryfilter, startdate, enddate, checkquery) %>%
split(.$`#COUNTERPART_AREA`)

Just add tidy = TRUE to the CompactDataMethod call:
library(IMFData)
databaseID <- "DOT"
startdate = "2019-07-01"
enddate = "2019-09-01"
checkquery = FALSE
queryfilter <- list(CL_FREQ = "M", CL_AREA_DOT = "AU",
CL_INDICATOR_DOT = "TXG_FOB_USD",
CL_COUNTERPART_AREA_DOT = c("BR", "US"))
df_DOTS <- CompactDataMethod(databaseID,
queryfilter,
startdate,
enddate,
checkquery,
tidy = TRUE)
df_DOTS
#TIME_PERIOD #OBS_VALUE #FREQ #REF_AREA #INDICATOR #COUNTERPART_AREA #UNIT_MULT #TIME_FORMAT
1 2019-07 876.025841 M AU TXG_FOB_USD US 6 P1M
2 2019-08 872.02118 M AU TXG_FOB_USD US 6 P1M
3 2019-09 787.272851 M AU TXG_FOB_USD US 6 P1M
4 2019-07 55.687747 M AU TXG_FOB_USD BR 6 P1M
5 2019-08 36.076581 M AU TXG_FOB_USD BR 6 P1M
6 2019-09 57.764474 M AU TXG_FOB_USD BR 6 P1M
you just need one group_by(#COUNTERPART_AREA) %>% summarise(mean = mean(#OBS_VALUE)):
library(tidyverse)
df_DOTS %>%
group_by(`#COUNTERPART_AREA`, `#REF_AREA`) %>%
summarise(mean = mean(as.numeric(`#OBS_VALUE`))) %>%
spread( `#COUNTERPART_AREA`, mean)
#output
`#REF_AREA` BR US
<chr> <dbl> <dbl>
1 AU 49.8 845.
Or if you insist on a matrix
df_DOTS %>%
group_by(`#COUNTERPART_AREA`, `#REF_AREA`) %>%
summarise(mean = mean(as.numeric(`#OBS_VALUE`))) %>%
spread( `#COUNTERPART_AREA`, mean) %>%
column_to_rownames("#REF_AREA") %>%
as.matrix
#output
BR US
AU 49.84293 845.1066

From the input data, we could loop over with map, pluck the elements that is needed, convert to numeric, get the mean, and convert to a two column tibble with enframe
library(purrr)
library(tidyr)
map(df_DOTS, ~ .x %>%
pluck("Obs", 1, "#OBS_VALUE") %>%
as.numeric %>%
mean) %>%
enframe %>%
unnest(c(value))
# A tibble: 2 x 2
# name value
# <chr> <dbl>
#1 BR 49.8
#2 US 845.

Another option would be like this:
tmp <- df_DOTS %>%
as_tibble() %>%
summarise(across(everything(), ~mean(as.numeric(.x$Obs[[1]]$`#OBS_VALUE`))))
tmp
# # A tibble: 1 x 2
# BR US
# <dbl> <dbl>
# 1 49.8 845.

Related

Perform a series of mutations to columns in dataframe

I am trying to replace some text in my dataframe (a few rows given below)
> dput(Henry.longer[1:4,])
structure(list(N_l = c(4, 4, 4, 4), UG = c("100", "100", "100",
"100"), S = c(12, 12, 12, 12), Sample = c(NA, NA, NA, NA), EQ = c("Henry",
"Henry", "Henry", "Henry"), DF = c(0.798545454545455, 0.798545454545455,
0.798545454545455, 0.798545454545455), meow = c("Henry.Exterior.single",
"Multi", "Henry.Exterior.multi", "Henry.Interior.single"), Girder = c("Henry.Exterior.single",
"Henry.Interior.multi", "Henry.Exterior.multi", "Interior")), row.names = c(NA,
-4L), groups = structure(list(UG = "100", S = 12, .rows = list(
1:4)), row.names = c(NA, -1L), class = c("tbl_df", "tbl",
"data.frame"), .drop = FALSE), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"))
I try to mutate the dataframe as:
Henry.longer <- Henry.longer %>%
mutate(Loading = str_replace(meow, "Henry.Exterior.single", "Single")) %>%
mutate(Loading = str_replace(meow, "Henry.Exterior.multi", "Multi")) %>%
mutate(Loading = str_replace(meow, "Henry.Interior.single", "Single")) %>%
mutate(Loading = str_replace(meow, "Henry.Interior.multi", "Multi")) %>%
mutate(Girder = str_replace(meow, "Henry.Exterior.multi", "Exterior")) %>%
mutate(Girder = str_replace(meow, "Henry.Exterior.single", "Exterior")) %>%
mutate(Girder = str_replace(meow, "Henry.Interior.multi", "Interior")) %>%
mutate(Girder = str_replace(meow, "Henry.Interior.single", "Interior")) %>%
select(-meow)
But for some reason the results does not get applied to all the rows and only:
N_l UG S Sample EQ DF Loading Girder
1 4 100 12 NA Henry 0.799 Henry.Exterior.single Henry.Exterior.single
2 4 100 12 NA Henry 0.799 Multi Henry.Interior.multi
3 4 100 12 NA Henry 0.799 Henry.Exterior.multi Henry.Exterior.multi
4 4 100 12 NA Henry 0.799 Henry.Interior.single Interior
I think we can use lookup vectors for this, if it's easy or safer to use static string lookups:
tr_vec <- c(Henry.Exterior.single = "Single", Henry.Exterior.multi = "Multi", Henry.Interior.single = "Single", Henry.Interior.multi = "Multi")
tr_vec2 <- c(Henry.Exterior.multi = "Exterior", Henry.Exterior.single = "Exterior", Henry.Interior.multi = "Interior", Henry.Interior.single = "Interior")
Henry.longer %>%
mutate(
Loading = coalesce(tr_vec[Loading], Loading),
Girder = coalesce(tr_vec2[Girder], Girder)
)
# # A tibble: 4 x 8
# # Groups: UG, S [1]
# N_l UG S Sample EQ DF Loading Girder
# <dbl> <chr> <dbl> <lgl> <chr> <dbl> <chr> <chr>
# 1 4 100 12 NA Henry 0.799 Single Exterior
# 2 4 100 12 NA Henry 0.799 Multi Interior
# 3 4 100 12 NA Henry 0.799 Multi Exterior
# 4 4 100 12 NA Henry 0.799 Single Interior
The advantage of RonakShah's regex solution is that it can very easily handle many of the types of substrings you appear to need. Regexes do carry a little risk, though, in that they may (unlikely in that answer, but) miss match.
Instead of using str_replace I guess it would be easier to extract what you want using regex.
library(dplyr)
Henry.longer %>%
mutate(Loading = sub('.*\\.', '', meow),
Girder = sub('.*\\.(\\w+)\\..*', '\\1', meow))
where
Loading - removes everything until last dot
Girder - extracts a word between two dots.
Oh boy, looks like you've got some answers here already but here's a super-simple one that uses stringr::str_extract:
Henry.longer <- Henry.longer %>%
mutate(Loading = str_extract(meow, "single|multi")) %>%
mutate(Girder = str_extract(meow, "Interior|Exterior"))
It's worth noting that the demo data has a weird entry for meow in one column, so it didn't run perfectly on my machine:

Adding summary row to dplyr output

I found a few solutions on here but none seem to work to add a summary row to dplyr output.
#mock up data
df <- data.frame("Market" = sample(c("East", "North", "West"), 100, replace = TRUE, prob = c(0.33, 0.33, 0.34)),
"var1" = sample(c("Y", "N"), 100, replace = TRUE, prob = c(0.4, 0.6)),
"var2" = sample(c("Y", "N"), 100, replace = TRUE, prob = c(0.7, 0.3)),
"var3" = sample(c("Y", "N"), 100, replace = TRUE, prob = c(0.5, 0.5)))
Here is the code:
df_report <- df %>%
group_by(Market) %>%
filter(Market == "East" | Market == "West") %>%
summarise(n = n(),
var1_y = sum(var1 == "Y"),
var1_n = sum(var1 == "N")) %>%
mutate(total = var1_y + var1_n,
var1_y_pct = (var1_y/total),
var1_n_pct = (var1_n/total),
pct_total = total/sum(total))
Here is the output:
# A tibble: 2 x 8
Market n var1_y var1_n total var1_y_pct var1_n_pct pct_total
<fct> <int> <int> <int> <int> <dbl> <dbl> <dbl>
1 East 29 13 16 29 0.448 0.552 0.453
2 West 35 16 19 35 0.457 0.543 0.547
Here are the two solutions I tried:
Option 1
df_report %>%
add_row(Market = "Total", n = sum(n), var1_y = sum(var1_y), var1_n = sum(var1_n),
total = sum(total), var1_y_pct = sum(var1_y_pct), var1_n_pct = sum(varn_y_pct), pct_total = sum(pct_total))
Option 2
df_report %>%
rbind(c("Total", sum(n), sum(var1_y), sum(var1_n), sum(total), sum(var1_y_pct), sum(varn_y_pct), sum(pct_total)))
Both give me the same error: Error in sum(n) : invalid 'type' (closure) of argument
I'm unable to determine why these solutions, while working for others and seeming very reasonable, are not working for me.
You should try
df_report %>% janitor::adorn_totals("row")
Which produces
Market n var1_y var1_n total var1_y_pct var1_n_pct pct_total
East 30 11 19 30 0.3666667 0.6333333 0.4285714
West 40 19 21 40 0.4750000 0.5250000 0.5714286
Total 70 30 40 70 0.8416667 1.1583333 1.0000000
The long way of doing this is going for summarise (watch out, you have a typo in var1_n_pct). Then bind the rows.
row_to_add <- df_report %>%
summarise(Market = "Total",
n = sum(n),
var1_y = sum(var1_y),
var1_n = sum(var1_n),
total = sum(total),
var1_y_pct = sum(var1_y_pct),
var1_n_pct = sum(var1_n_pct),
pct_total = sum(pct_total))
df_report %>% bind_rows(row_to_add)

Replacing missing values

Let's say I have a dataframe containing the sales for some quarters, while the values for the following quarters are missing. I would like to replace the NAs by a simple formula (with mutate/dplyr like below). The issue is that I don't want to use mutate so many times. How could I do that for all NAs at the same time? Is there a way?
structure(list(Period = c("1999Q1", "1999Q2", "1999Q3", "1999Q4",
"2000Q1", "2000Q2", "2000Q3", "2000Q4", "2001Q1", "2001Q2", "2001Q3",
"2001Q4", "2002Q1", "2002Q2", "2002Q3", "2002Q4", "2003Q1", "2003Q2",
"2003Q3", "2003Q4"), Sales= c(353.2925571, 425.9299841, 357.5204626,
363.80247, 302.8081066, 394.328576, 435.15573, 387.99768, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA,
-20L))
test %>%
mutate(Sales = ifelse(is.na(Sales), 1.05*lag(Sales, 4), Sales)) %>%
mutate(Sales = ifelse(is.na(Sales), 1.05*lag(Sales, 4), Sales)) %>%
mutate(Sales = ifelse(is.na(Sales), 1.05*lag(Sales, 4), Sales))
One dplyr and tidyr possibility could be:
df %>%
group_by(quarter = substr(Period, 5, 6)) %>%
mutate(Sales_temp = replace_na(Sales, last(na.omit(Sales)))) %>%
group_by(quarter, na = is.na(Sales)) %>%
mutate(constant = 1.05,
Sales_temp = Sales_temp * cumprod(constant),
Sales = coalesce(Sales, Sales_temp)) %>%
ungroup() %>%
select(1:2)
Period Sales
<chr> <dbl>
1 1999Q1 353.
2 1999Q2 426.
3 1999Q3 358.
4 1999Q4 364.
5 2000Q1 303.
6 2000Q2 394.
7 2000Q3 435.
8 2000Q4 388.
9 2001Q1 318.
10 2001Q2 414.
11 2001Q3 457.
12 2001Q4 407.
13 2002Q1 334.
14 2002Q2 435.
15 2002Q3 480.
16 2002Q4 428.
17 2003Q1 351.
18 2003Q2 456.
19 2003Q3 504.
20 2003Q4 449.
Or with just dplyr:
df %>%
group_by(quarter = substr(Period, 5, 6)) %>%
mutate(Sales_temp = if_else(is.na(Sales), last(na.omit(Sales)), Sales)) %>%
group_by(quarter, na = is.na(Sales)) %>%
mutate(constant = 1.05,
Sales_temp = Sales_temp * cumprod(constant),
Sales = coalesce(Sales, Sales_temp)) %>%
ungroup() %>%
select(1:2)
x <- test$Sales
# find that last non-NA data
last.valid <- tail(which(!is.na(x)),1)
# store the "base"
base <- ceiling(last.valid/4)*4 + (-3:0)
base <- base + ifelse(base > last.valid, -4, 0)
base <- x[base]
# calculate the "exponents"
expos <- ceiling( ( seq(length(x)) - last.valid ) / 4 )
test$Sales <- ifelse(is.na(x), bases * 1.05 ^ expos, x)
tail(test)
# Period Sales
# 15 2002Q3 479.7592
# 16 2002Q4 427.7674
# 17 2003Q1 350.5382
# 18 2003Q2 456.4846
# 19 2003Q3 503.7472
# 20 2003Q4 449.1558
Here's another base solution:
non_nas <- na.omit(test$Sales)
nas <- length(attr(non_nas, 'na.action'))
test$Sales <- c(non_nas, #keep non_nas
tail(non_nas, 4) * 1.05 ^(rep(1:floor(nas / 4), each = 4, length.out = nas)))
test

Transpose dplyr::tbl object

I am using src_postgres to connect and dplyr::tbl function to fetch data from redshift database. I have applied some filters and top function to it using the dplyr itself. Now my data looks as below:
riid day hour
<dbl> <chr> <chr>
1 5542. "THURSDAY " 12
2 5862. "FRIDAY " 15
3 5982. "TUESDAY " 15
4 6022. WEDNESDAY 16
My final output should be as below:
riid MON TUES WED THUR FRI SAT SUN
5542 12
5862 15
5988 15
6022 16
I have tried spread. It throws the below error because of the class type:
Error in UseMethod("spread_") : no applicable method for 'spread_'
applied to an object of class "c('tbl_dbi', 'tbl_sql', 'tbl_lazy',
'tbl')"
Since this is a really big table, I do not want to use dataframe as it takes a longer time.
I was able to use as below:
df_mon <- df2 %>% filter(day == 'MONDAY') %>% mutate(MONDAY = hour) %>% select(riid,MONDAY)
df_tue <- df2 %>% filter(day == 'TUESDAY') %>% mutate(TUESDAY = hour) %>% select(riid,TUESDAY)
df_wed <- df2 %>% filter(day == 'WEDNESDAY') %>% mutate(WEDNESDAY = hour) %>% select(riid,WEDNESDAY)
df_thu <- df2 %>% filter(day == 'THURSDAY') %>% mutate(THURSDAY = hour) %>% select(riid,THURSDAY)
df_fri <- df2 %>% filter(day == 'FRIDAY') %>% mutate(FRIDAY = hour) %>% select(riid,FRIDAY)
Is it possible to write all above in one statement?
Any help to transpose this in a faster manner is really appreciated.
EDIT
Adding the dput of the tbl object:
structure(list(src = structure(list(con = <S4 object of class structure("PostgreSQLConnection", package = "RPostgreSQL")>,
disco = <environment>), .Names = c("con", "disco"), class = c("src_dbi",
"src_sql", "src")), ops = structure(list(name = "select", x = structure(list(
name = "filter", x = structure(list(name = "filter", x = structure(list(
name = "group_by", x = structure(list(x = structure("SELECT riid,day,hour,sum(weightage) AS score FROM\n (SELECT riid,day,hour,\n POWER(2,(cast(datediff (seconds,convert_timezone('UTC','PKT',SYSDATE),TO_DATE(TO_CHAR(event_captured_dt,'mm/dd/yyyy hh24:mi:ss'),'mm/dd/yyyy hh24:mi:ss')) as decimal) / cast(7862400 as decimal))) AS weightage\n FROM (\n SELECT riid,convert_timezone('GMT','PKT',event_captured_dt) AS EVENT_CAPTURED_DT,\n TO_CHAR(convert_timezone('GMT','PKT',event_captured_dt),'DAY') AS day,\n TO_CHAR(convert_timezone('GMT','PKT',event_captured_dt),'HH24') AS hour\n FROM Zameen_STO_DATA WHERE EVENT_CAPTURED_DT >= TO_DATE((sysdate -30),'yyyy-mm-dd') and LIST_ID = 4282\n )) group by riid,day,hour", class = c("sql",
"character")), vars = c("riid", "day", "hour", "score"
)), .Names = c("x", "vars"), class = c("op_base_remote",
"op_base", "op")), dots = structure(list(riid = riid,
day = day), .Names = c("riid", "day")), args = structure(list(
add = FALSE), .Names = "add")), .Names = c("name",
"x", "dots", "args"), class = c("op_group_by", "op_single",
"op")), dots = structure(list(~min_rank(desc(~score)) <=
1), .Names = ""), args = list()), .Names = c("name",
"x", "dots", "args"), class = c("op_filter", "op_single",
"op")), dots = structure(list(~row_number() == 1), .Names = ""),
args = list()), .Names = c("name", "x", "dots", "args"), class = c("op_filter",
"op_single", "op")), dots = structure(list(~riid, ~day, ~hour), class = "quosures", .Names = c("",
"", "")), args = list()), .Names = c("name", "x", "dots", "args"
), class = c("op_select", "op_single", "op"))), .Names = c("src",
"ops"), class = c("tbl_dbi", "tbl_sql", "tbl_lazy", "tbl"))
I think what you're looking for is the ability to run the tidyr::spread() function against a remote source, or database. I have a PR for dbplyr that attempts to implement that here: https://github.com/tidyverse/dbplyr/pull/72, you can try it out by using: devtools::install_github("tidyverse/dbplyr", ref = devtools::github_pull(72)).
Use dcast from reshape2 package
> data
# A tibble: 4 x 3
riid day hour
<dbl> <chr> <dbl>
1 1.00 TH 12.0
2 2.00 FR 15.0
3 3.00 TU 15.0
4 4.00 WE 16.0
> dcast(data, riid~day, value.var = "hour")
riid FR TH TU WE
1 1 NA 12 NA NA
2 2 15 NA NA NA
3 3 NA NA 15 NA
4 4 NA NA NA 16
Further if you want to remove NA, then
> z <- dcast(data, riid~day, value.var = "hour")
> z[is.na(z)] <- ""
> z
riid FR TH TU WE
1 1 12
2 2 15
3 3 15
4 4 16
I tried to combine your multiple line attempts into one. Can you try this and let us know the outcome?
library(dplyr)
df %>%
rowwise() %>%
mutate(Mon = ifelse(day=='MONDAY', hour[day=='MONDAY'], NA),
Tue = ifelse(day=='TUESDAY', hour[day=='TUESDAY'], NA),
Wed = ifelse(day=='WEDNESDAY', hour[day=='WEDNESDAY'], NA),
Thu = ifelse(day=='THURSDAY', hour[day=='THURSDAY'], NA),
Fri = ifelse(day=='FRIDAY', hour[day=='FRIDAY'], NA),
Sat = ifelse(day=='SATURDAY', hour[day=='SATURDAY'], NA),
Sun = ifelse(day=='SUNDAY', hour[day=='SUNDAY'], NA)) %>%
select(-day, -hour)
Output is:
riid Mon Tue Wed Thu Fri Sat Sun
1 5542 NA NA NA 12 NA NA NA
2 5862 NA NA NA NA 15 NA NA
3 5982 NA 15 NA NA NA NA NA
4 6022 NA NA 16 NA NA NA NA
Sample data:
# A tibble: 4 x 3
riid day hour
* <dbl> <chr> <int>
1 5542 THURSDAY 12
2 5862 FRIDAY 15
3 5982 TUESDAY 15
4 6022 WEDNESDAY 16
Update:
Can you try below approach using data.table?
library(data.table)
dt <- setDT(df)[, c("Mon","Tue","Wed","Thu","Fri","Sat","Sun") :=
list(ifelse(day=='MONDAY', hour[day=='MONDAY'], NA),
ifelse(day=='TUESDAY', hour[day=='TUESDAY'], NA),
ifelse(day=='WEDNESDAY', hour[day=='WEDNESDAY'], NA),
ifelse(day=='THURSDAY', hour[day=='THURSDAY'], NA),
ifelse(day=='FRIDAY', hour[day=='FRIDAY'], NA),
ifelse(day=='SATURDAY', hour[day=='SATURDAY'], NA),
ifelse(day=='SUNDAY', hour[day=='SUNDAY'], NA))][, !c("day","hour"), with=F]

how to assign words to a number in a dataframe

I have a below dataframe with numbers in two of the columns and I should replace that with string using my other reference dataset.
Dataset 1:
lhs rhs
32,39,6 65
39,6,65 32
14,16,26 15
16,20,4 26
16,26,33 4
53 31
Dataset 2:
id name
4 yougurt
6 coffee
14 cream chese
15 meat spreads
16 butter
20 whole milk
26 condensed milk
31 curd
32 flour
39 rolls
53 sugar
65 soda
Expected output:
lhs rhs
flour, rolls, coffee soda
rolls, coffee, soda flour
cream chease, butter, condensed milk meat spreads
A solution using dplyr and tidyr. dat is the final output. The key is to use separate_rows to expand the lhs and then conduct left_join twice.
library(dplyr)
library(tidyr)
dat <- dat1 %>%
separate_rows(lhs, convert = TRUE) %>%
left_join(dat2, by = c("lhs" = "id")) %>%
left_join(dat2, by = c("rhs" = "id")) %>%
drop_na(name.x) %>%
group_by(name.y) %>%
summarise(lhs = paste0(name.x, collapse = ", ")) %>%
ungroup() %>%
select(lhs, rhs = name.y)
dat
# # A tibble: 6 x 2
# lhs rhs
# <chr> <chr>
# 1 butter, whole milk, yougurt condensed milk
# 2 sugar curd
# 3 rolls, coffee, soda flour
# 4 cream chese, butter, condensed milk meat spreads
# 5 flour, rolls, coffee soda
# 6 butter, condensed milk yougurt
DATA
dat1 <- read.table(text = "lhs rhs
'32,39,6' 65
'39,6,65' 32
'14,16,26' 15
'16,20,4' 26
'16,26,33' 4
53 31 ",
stringsAsFactors = FALSE, header = TRUE)
dat2 <- read.table(text = "id name
4 yougurt
6 coffee
14 'cream chese'
15 'meat spreads'
16 butter
20 'whole milk'
26 'condensed milk'
31 curd
32 flour
39 rolls
53 sugar
65 soda",
header = TRUE, stringsAsFactors = FALSE)
Another option. Here d1 is your first data frame and d2 your second.
library(tidyverse)
d1 %>% separate(lhs, sep = ',', into = c('v1', 'v2', 'v3')) %>%
mutate_all(as.numeric) %>%
left_join(d2, by = c('v1'='id')) %>%
left_join(d2, by = c('v2'='id')) %>%
left_join(d2, by = c('v3'='id')) %>%
left_join(d2, by = c('rhs'='id')) %>%
unite(lhs, name.x, name.y, name.x.x, sep = ',') %>%
mutate(lhs = str_replace_all(lhs, ',NA', '')) %>%
select(lhs, rhs = name.y.y)
OR, as pointed out by #Moody_Mudskipper in the comments
d1 %>% separate(lhs, sep = ',', into = c('v1', 'v2', 'v3')) %>%
mutate_all(as.numeric) %>%
lmap(~setNames(left_join(setNames(.x, "id"), d2)[2], names(.x))) %>%
unite(lhs, v1, v2, v3, sep = ', ') %>%
mutate(lhs = str_replace_all(lhs, ',NA', '')) %>%
select(lhs, rhs = name.y.y)
lhs rhs
1 flour, rolls, coffee soda
2 rolls, coffee, soda flour
3 cream chese, butter, condensed milk meat spreads
4 butter, whole milk, yougurt condensed milk
5 butter, condensed milk yougurt
6 sugar curd
This is almost the same as www, but appears to be a little faster. Apparently using strsplit and unnest is faster than separate_rows
require(tidyverse)
df1 %>%
mutate(lhs = sapply(lhs, strsplit, ',')) %>%
unnest %>%
mutate_at(c('lhs', 'rhs'), as.numeric) %>%
left_join(df2, by = c('lhs'= 'id')) %>%
left_join(df2, by = c('rhs'= 'id')) %>%
group_by(name.y) %>%
summarize(name.x = paste(name.x, collapse = ', ')) %>%
rename(rhs = name.y, lhs = name.x)
Then there's the data.table solution, which is much faster.
require(data.table)
setDT(df1)
df1[, .(lhs = unlist(strsplit(lhs, ','))), rhs] %>%
.[, lapply(.SD, as.numeric)] %>%
merge(df2, by.x = 'lhs', by.y = 'id') %>%
merge(df2, by.x = 'rhs', by.y = 'id') %>%
.[, .(lhs = paste0(name.x, collapse = ',')), by = .(rhs = name.y)]
Benchmark
# Results
# Unit: relative
# expr min lq mean median uq max neval
# useDT() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 300
# UseUnnest() 5.570704 5.632532 5.274552 5.374714 5.042518 9.254190 300
# UseSeparateRows() 8.640615 8.356889 7.661669 7.939593 7.401666 7.896038 300
# Method
require(tidyverse)
require(data.table)
df1 <- fread("
lhs rhs
32,39,6 65
39,6,65 32
14,16,26 15
16,20,4 26
16,26,33 4
53 31
")
df2 <- fread("
id name
4 yougurt
6 coffee
14 cream_chese
15 meat_spreads
16 butter
20 whole_milk
26 condensed_milk
31 curd
32 flour
39 rolls
53 sugar
65 soda
")
useDT <- function(x){
df1[, lapply(sapply(lhs, strsplit, ','), unlist), rhs] %>%
setNames(c('rhs', 'lhs')) %>%
.[, `:=`(lhs = as.numeric(lhs),
rhs = as.numeric(rhs))] %>%
.[df2, on = c('lhs'= 'id')] %>%
.[df2, on = c('rhs'= 'id')] %>%
.[, .(lhs = paste0(name, collapse = ',')), by = i.name] %>%
.[lhs != 'NA', .(lhs, rhs = i.name)]
}
UseUnnest <- function(x){
df1 %>%
mutate(lhs = sapply(lhs, strsplit, ',')) %>%
unnest %>%
mutate_at(c('lhs', 'rhs'), as.numeric) %>%
left_join(df2, by = c('lhs'= 'id')) %>%
left_join(df2, by = c('rhs'= 'id')) %>%
group_by(name.y) %>%
summarize(name.x = paste(name.x, collapse = ', ')) %>%
rename(rhs = name.y, lhs = name.x)
}
UseSeparateRows <- function(x){
df1 %>%
separate_rows(lhs, convert = TRUE) %>%
left_join(df2, by = c("lhs" = "id")) %>%
left_join(df2, by = c("rhs" = "id")) %>%
drop_na(name.x) %>%
group_by(name.y) %>%
summarise(lhs = paste0(name.x, collapse = ", ")) %>%
ungroup() %>%
select(lhs, rhs = name.y)
}
microbenchmark(useDT(), UseUnnest(), UseSeparateRows(), times = 300, unit = 'relative')
Here is an option using just base R and mapping the numeric values to factor labels.
Split the string, map the labels to the values and then collapse the labels back into a string.
df<-structure(list(id = c(4L, 6L, 14L, 15L, 16L, 20L, 26L, 31L, 32L,
39L, 53L, 65L), name = c("yougurt", "coffee", "cream cheese",
"meat spreads", "butter", "whole milk", "condensed milk", "curd",
"flour", "rolls", "sugar", "soda")), .Names = c("id", "name"),
class = "data.frame", row.names = c(NA, -12L))
input<-structure(list(lhs = c("32,39,6", "39,6,65", "14,16,26", "16,20,4",
"16,26,33", "53"), rhs = c(65L, 32L, 15L, 26L, 4L, 31L)),
.Names = c("lhs", "rhs"), class = "data.frame", row.names = c(NA, -6L))
#new left hand side
newlhs<-sapply(as.character(input$lhs), function(x){
strs<-unlist(strsplit(x, ","))
f<-factor(strs, levels=df$id, labels=df$name)
paste(f, collapse = ", ")
})
#new right hand side
newrhs<-sapply(as.character(input$rhs), function(x){
strs<-unlist(strsplit(x, ","))
f<-factor(strs, levels=df$id, labels=df$name)
paste(f, collapse = ", ")
})
answer<-data.frame(newlhs, newrhs)
row.names(answer)<-NULL #remove rownames
Not so idiomatic but I win the code golf :) :
as.data.frame(lapply(dat1, function(x){
for (i in seq(nrow(dat2))) x <- gsub(paste0("(^|,)",dat2$id[i],"(,|$)"),
paste0("\\1",dat2$name[i],"\\2"),x)
x}))
# lhs rhs
# 1 flour,rolls,coffee soda
# 2 rolls,coffee,soda flour
# 3 cream chese,butter,condensed milk meat spreads
# 4 butter,whole milk,yougurt condensed milk
# 5 butter,condensed milk,33 yougurt
# 6 sugar curd
May fail if you have numbers in 2nd dataset.

Resources