Related
Hi, I am a bit new, so I am not sure if I am doing this right, but I looked around on Stack Overflow and couldn't find code or advice that worked for my case.
I have a dataframe mainDF that looks like this:
Person
ABG
SEP
CLC
XSP
APP
WED
GSH
SP-1
2.1
3.0
1.3
1.8
1.4
2.5
1.4
SP-2
2.5
2.1
2.0
1.9
1.2
1.2
2.1
SP-3
2.3
3.1
2.5
1.5
1.1
2.6
2.1
I have another dataframe, TranslateDF that has the converting info for the abbreviated column names. And I want to replace the abbreviated names with the real names here:
Note that the translation data frame may contain extraneous entries, or it may be missing an entry for a column; any column of mainDF that has no full name in the translation data frame should be dropped from the data.
Abbreviated
Full Naming
ABG
All barbecue grill
SEP
shake eel peel
CLC
cold loin cake
XSP
xylophone spear pint
APP
apple pot pie
HUM
hall united meat
LPL
lending porkloin
Ideally, I would get the new resulted dataframe as:
Person
All barbecue grill
shake eel peel
cold loin cake
xylophone spear pint
apple pot pie
SP-1
2.1
3.0
1.3
1.8
1.4
SP-2
2.5
2.1
2.0
1.9
1.2
SP-3
2.3
3.1
2.5
1.5
1.1
I would appreciate any help on this thank you!
You can pass a named vector to select() which will rename and select in one step. Wrapping with any_of() ensures it won't fail if any columns don't exist in the main data frame:
library(dplyr)
# Rename and select in one step. The lookup vector holds the abbreviations
# as values and the full names as names, so select() renames every matched
# column to its full name. any_of() skips abbreviations missing from df1.
select(
  df1,
  Person,
  any_of(setNames(df2[["Abbreviated"]], df2[["Full_Naming"]]))
)
# A tibble: 3 x 6
Person `All barbecue grill` `shake eel peel` `cold loin cake` `xylophone spear pint` `apple pot pie`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 SP-1 2.1 3 1.3 1.8 1.4
2 SP-2 2.5 2.1 2 1.9 1.2
3 SP-3 2.3 3.1 2.5 1.5 1.1
Data:
df1 <- structure(list(Person = c("SP-1", "SP-2", "SP-3"), ABG = c(2.1,
2.5, 2.3), SEP = c(3, 2.1, 3.1), CLC = c(1.3, 2, 2.5), XSP = c(1.8,
1.9, 1.5), APP = c(1.4, 1.2, 1.1), WED = c(2.5, 1.2, 2.6), GSH = c(1.4,
2.1, 2.1)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -3L), spec = structure(list(cols = list(
Person = structure(list(), class = c("collector_character",
"collector")), ABG = structure(list(), class = c("collector_double",
"collector")), SEP = structure(list(), class = c("collector_double",
"collector")), CLC = structure(list(), class = c("collector_double",
"collector")), XSP = structure(list(), class = c("collector_double",
"collector")), APP = structure(list(), class = c("collector_double",
"collector")), WED = structure(list(), class = c("collector_double",
"collector")), GSH = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
df2 <- structure(list(Abbreviated = c("ABG", "SEP", "CLC", "XSP", "APP",
"HUM", "LPL"), Full_Naming = c("All barbecue grill", "shake eel peel",
"cold loin cake", "xylophone spear pint", "apple pot pie", "hall united meat",
"lending porkloin")), class = "data.frame", row.names = c(NA,
-7L))
How about this:
mainDF <- structure(list(Person = c("SP-1", "SP-2", "SP-3"), ABG = c(2.1,
2.5, 2.3), SEP = c(3, 2.1, 3.1), CLC = c(1.3, 2, 2.5), XSP = c(1.8,
1.9, 1.5), APP = c(1.4, 1.2, 1.1), WED = c(2.5, 1.2, 2.6), GSH = c(1.4,
2.1, 2.1)), row.names = c(NA, 3L), class = "data.frame")
translateDF <- structure(list(Abbreviated = c("ABG", "SEP", "CLC", "XSP", "APP",
"HUM", "LPL"), `Full Naming` = c("All barbecue grill", "shake eel peel",
"cold loin cake", "xylophone spear pint", "apple pot pie", "hall united meat",
"lending porkloin")), row.names = c(NA, 7L), class = "data.frame")
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tidyr)
# Translate the abbreviated column names via a long-format round trip:
# abbreviations become row values, are joined to the translation table,
# untranslated rows are dropped, and the data is widened under the full names.
mainDF %>%
  pivot_longer(cols = -Person, names_to = "Abbreviated", values_to = "vals") %>%
  # Join on the shared "Abbreviated" column (key inferred by the join).
  left_join(y = translateDF) %>%
  select(!Abbreviated) %>%
  # Abbreviations without a translation have an NA full name; remove them.
  na.omit() %>%
  pivot_wider(names_from = "Full Naming", values_from = vals)
#> Joining, by = "Abbreviated"
#> # A tibble: 3 × 6
#> Person `All barbecue grill` `shake eel peel` `cold loin cake` `xylophone spe…`
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 SP-1 2.1 3 1.3 1.8
#> 2 SP-2 2.5 2.1 2 1.9
#> 3 SP-3 2.3 3.1 2.5 1.5
#> # … with 1 more variable: `apple pot pie` <dbl>
Created on 2022-04-24 by the reprex package (v2.0.1)
library(tidyverse)
# Rename the abbreviated columns to their full names and drop every column
# that has no translation.
# The lookup is a named vector (names = full names from column 2, values =
# abbreviations from column 1); select() renames each matched column to the
# corresponding name. Matching is on exact column names, so an abbreviation
# that merely occurs inside another column name (or that contains regex
# metacharacters) cannot corrupt it. `[[` extracts plain vectors for both
# data frames and tibbles. any_of() ignores abbreviations absent from mainDF.
mainDF %>%
  select(Person, any_of(set_names(TranslateDF[[1]], TranslateDF[[2]])))
Person All barbecue grill shake eel peel cold loin cake xylophone spear pint apple pot pie
1 SP-1 2.1 3.0 1.3 1.8 1.4
2 SP-2 2.5 2.1 2.0 1.9 1.2
3 SP-3 2.3 3.1 2.5 1.5 1.1
Say we have two different datasets:
Dataset A:
ids name price
1234 bread 1.5
245r7 butter 1.2
123984 red wine 5
43498 beer 1
235897 cream 1.8
Dataset B:
ids name price
24908 lait 1
1234,089 pain 1.7
77289,43498 bière 1.5
245r7 beurre 1.4
My goal is to match all the products sharing at least one ID and bring them together into a new dataset that should look as follows:
id a_name b_name a_price b_price
1234 bread pain 1.5 1.7
245r7 butter beurre 1.2 1.4
43498 beer bière 1 1.5
Is this feasible using stringr or any other R package?
You can create a long dataset with separate_rows and then do a join.
library(dplyr)
library(tidyr)
# Split B's comma-separated ids so every id gets its own row, then keep only
# the rows whose id also occurs in A.
inner_join(separate_rows(B, ids, sep = ","), A, by = "ids")
# ids name.x price.x name.y price.y
# <chr> <chr> <dbl> <chr> <dbl>
#1 1234 pain 1.7 bread 1.5
#2 43498 bière 1.5 beer 1
#3 245r7 beurre 1.4 butter 1.2
We can use the sqldf package here:
library(sqldf)
# Join df_a to df_b wherever a.ids occurs as one complete entry of the
# comma-separated list in b.ids. Both sides are wrapped in commas before the
# LIKE comparison so that an id only matches a whole list entry and not a
# fragment of a longer id.
# NOTE(review): the query reads tables df_a / df_b, while the sample data in
# this thread is named df_A / df_B — confirm the names resolve as intended.
sql <- "SELECT a.ids AS id, a.name AS a_name, b.name AS b_name, a.price AS a_price,
b.price AS b_price
FROM df_a a
INNER JOIN df_b b
ON ',' || b.ids || ',' LIKE '%,' || a.ids || ',%'"
# Run the query against the data frames and keep the joined result.
output <- sqldf(sql)
As separate_rows (my favorite) is already provided by Ronak Shah,
Here is another strategy using strsplit and unnest():
library(tidyr)
library(dplyr)
# Split the comma-separated ids into a list-column, expand it to one row per
# id, and keep the rows whose id also occurs in df_A.
df_B %>%
  mutate(ids = strsplit(as.character(ids), ",", fixed = TRUE)) %>%
  # Name the list-column explicitly: calling unnest() without `cols` is
  # deprecated and raises a warning in current tidyr.
  unnest(cols = ids) %>%
  inner_join(df_A, by = "ids")
ids name.x price.x name.y price.y
<chr> <chr> <dbl> <chr> <chr>
1 1234 pain 1.7 bread 1.5
2 43498 bière 1.5 beer 1
3 245r7 beurre 1.4 butter 1.2
data:
# Dataset A as captured with dput().
# NOTE(review): row 3 was mis-parsed when the data was read — the product
# "red wine" was split, leaving name = "red" and price = "wine" (see the
# `problems` attribute: 3 columns expected, 4 found). As a consequence the
# whole `price` column is character rather than numeric.
df_A <- structure(list(ids = c("1234", "245r7", "123984", "43498", "235897"
), name = c("bread", "butter", "red", "beer", "cream"), price = c("1.5",
"1.2", "wine", "1", "1.8")), class = c("spec_tbl_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -5L), problems = structure(list(
row = 3L, col = NA_character_, expected = "3 columns", actual = "4 columns",
file = "'test'"), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame")))
# Dataset B: `ids` may hold several comma-separated ids per product.
# The product name "bière" is written out in full; the "bi??re" in the
# original dump was a mis-encoded copy of the accented character.
df_B <- structure(list(ids = c("24908", "1234,089", "77289,43498", "245r7"
), name = c("lait", "pain", "bière", "beurre"), price = c(1,
1.7, 1.5, 1.4)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -4L))
I have a dataframe that looks like this (but for every US county)
county
state
n_state_1
n_state_2
n_state_3
n_state_4
Autauga County
AL
NA
FL
NA
NA
Baldwin County
AL
GA
NA
TN
NA
Catron County
AL
FL
GA
NA
CA
I want to move the non-missing values (FL,GA,TN etc.) to the first columns starting from n_state_1 and then delete the columns containing only missing values to get:
county
state
n_state_1
n_state_2
n_state_3
Autauga County
AL
FL
NA
NA
Baldwin County
AL
GA
TN
NA
Catron County
AL
FL
GA
CA
I am struggling with the first step. I thought about using the function distinct but it doesn't work because there are non-empty elements in each column.
You could use dplyr and tidyr:
library(dplyr)
library(tidyr)
# Shift the non-missing neighbour states into the left-most n_state_* columns;
# only as many n_state_* columns as the fullest row needs are created.
df %>%
  pivot_longer(starts_with("n_state")) %>%
  # Drop only the missing state entries; an NA in any other column must not
  # remove a row.
  drop_na(value) %>%
  group_by(county, state) %>%
  # Number the remaining states within each county; these numbers become the
  # suffixes of the new column names.
  mutate(name = row_number()) %>%
  pivot_wider(names_prefix = "n_state_") %>%
  # Return an ungrouped tibble so later verbs are not silently grouped.
  ungroup()
which returns
county state n_state_1 n_state_2 n_state_3
<chr> <chr> <chr> <chr> <chr>
1 Autauga_County AL FL NA NA
2 Baldwin_County AL GA TN NA
3 Catron_County AL FL GA CA
What happened here?
pivot_longer takes the n_state_{n}-columns and collapses them into two columns: the name-column contains the original column name (n_state_1, n_state_2 etc), the value-column contains the states (FL, GA or <NA> in many cases).
Next we remove every <NA> entry. (Note: I write <NA> to make clear that it is an NA value.)
After a grouping by county and state we add a rownumber. These numbers will be later used to create the new column names.
pivot_wider now takes these row numbers and prefixes them with n_state_ to get the new columns. The values are taken from the value-column created in the second line of code. pivot_wider fills the missing values with <NA>-values (default behaviour).
Data
structure(list(county = c("Autauga_County", "Baldwin_County",
"Catron_County"), state = c("AL", "AL", "AL"), n_state_1 = c(NA,
"GA", "FL"), n_state_2 = c("FL", NA, "GA"), n_state_3 = c(NA,
"TN", NA), n_state_4 = c(NA, NA, "CA")), problems = structure(list(
row = 3L, col = "n_state_4", expected = "", actual = "embedded null",
file = "literal data"), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame")), class = c("spec_tbl_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -3L), spec = structure(list(
cols = list(county = structure(list(), class = c("collector_character",
"collector")), state = structure(list(), class = c("collector_character",
"collector")), n_state_1 = structure(list(), class = c("collector_character",
"collector")), n_state_2 = structure(list(), class = c("collector_character",
"collector")), n_state_3 = structure(list(), class = c("collector_character",
"collector")), n_state_4 = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
Or another option with dapply from collapse and select only columns with any non-NA elements
library(collapse)
library(dplyr)
# Row by row, move the non-missing entries to the front and the NAs to the
# back; afterwards keep only the columns that still hold at least one value.
dapply(df1, MARGIN = 1, FUN = function(row) {
  missing <- is.na(row)
  c(row[!missing], row[missing])
}) %>%
  select(where(function(col) any(complete.cases(col))))
# A tibble: 3 x 5
county state n_state_1 n_state_2 n_state_3
<chr> <chr> <chr> <chr> <chr>
1 Autauga_County AL FL <NA> <NA>
2 Baldwin_County AL GA TN <NA>
3 Catron_County AL FL GA CA
I have a df attached and I would like to create a loop that would apply a specific sequence (set by the user in R) based on conditions in column "x9". I would like to be able to set the sequence myself so I can try different sequences for this data frame, I will explain more below.
I have a df of losses and wins for an algorithm. On the first instance of a win I want to take the value in "x9" and divide it by the sequence value. I want to keep iterating through the sequence values until a loss is achieved. Once a loss is achieved the sequence will restart, when "x9" <0 to be specific.
I would like to create the two columns in my example "Risk Control" and "Sequence". Ideally I would like the function to iterate through the entire data frame so I can compare the column "x9" to "Risk Control".
Sample Data:
structure(list(x1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), x2 = c("2016.01.04 01:05",
"2016.01.04 01:12", "2016.01.04 01:13", "2016.01.04 01:17", "2016.01.04 01:20",
"2016.01.04 01:23", "2016.01.04 01:25", "2016.01.04 01:30", "2016.01.04 01:31",
"2016.01.04 01:59"), x3 = c("buy", "close", "buy", "close", "buy",
"close", "buy", "t/p", "buy", "close"), x4 = c(1, 1, 2, 2, 3,
3, 4, 4, 5, 5), x5 = c(8.46, 8.46, 8.6, 8.6, 8.69, 8.69, 8.83,
8.83, 9, 9), x6 = c(1.58873, 1.58955, 1.5887, 1.58924, 1.58862,
1.58946, 1.58802, 1.58902, 1.58822, 1.58899), x7 = c(1.57873,
1.57873, 1.5787, 1.5787, 1.57862, 1.57862, 1.57802, 1.57802,
1.57822, 1.57822), x8 = c(1.58973, 1.58973, 1.5897, 1.5897, 1.58962,
1.58962, 1.58902, 1.58902, 1.58922, 1.58922), x9 = c(0, 478.69,
0, 320.45, 0, 503.7, 0, 609.3, 0, 478.19), x10 = c(30000, 30478.69,
30478.69, 30799.14, 30799.14, 31302.84, 31302.84, 31912.14, 31912.14,
32390.33), `Risk Control` = c(NA, 478.69, NA, 320.45, NA, 251.85,
NA, 304.65, NA, 159.3966667), ...12 = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), Sequence = c(NA, 1, NA, 1, NA, 2, NA, 2, NA,
3)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"
), spec = structure(list(cols = list(x1 = structure(list(), class = c("collector_double",
"collector")), x2 = structure(list(), class = c("collector_character",
"collector")), x3 = structure(list(), class = c("collector_character",
"collector")), x4 = structure(list(), class = c("collector_double",
"collector")), x5 = structure(list(), class = c("collector_double",
"collector")), x6 = structure(list(), class = c("collector_double",
"collector")), x7 = structure(list(), class = c("collector_double",
"collector")), x8 = structure(list(), class = c("collector_double",
"collector")), x9 = structure(list(), class = c("collector_double",
"collector")), x10 = structure(list(), class = c("collector_double",
"collector")), `Risk Control` = structure(list(), class = c("collector_double",
"collector")), ...12 = structure(list(), class = c("collector_logical",
"collector")), Sequence = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"))
In short I need assistance in:
1. Constructing a sequence to apply to my df; I would like to be able to alter this sequence to try different sequences.
2. Taking the values in "x9" and creating a new column that applies the chosen sequence values, i.e. each value in "x9" is divided by the corresponding sequence number.
3. Constructing a loop that iterates through the entire df so that this is applied to all of the values in the data frame.
In the example above I have manually created "Risk Control" and the sample "Sequence". The sequence in the example is 1,1,2,2,3,3,4. The sequence in the sample uses each number twice before iterating to the next number. Once a loss is achieved in "x9" the sequence restarts.
I would appreciate any help with this function and loop. Thank you
Starting with input data only (not desired columns)
# Keep only the ten input columns (drops the manually built example columns).
df1 <- select(df, 1:10)
Reducing this data to only data with x9 not zero
This may not be intended and the user may prefer to key off an x3 event, but hopefully is illustrative.
# Keep only the rows that carry a result, i.e. where x9 is not zero.
df1 <- filter(df1, x9 != 0)
Initiate seq column and insert dummy data.
# Dummy restart markers for the 5 rows kept by the filter: 1 marks a row
# where the sequence (re)starts; the NA entries are filled in by the next step.
df1$seq <- c(1, NA, 1, NA, NA)
Fill in, thanks to Allan Cameron for this answer to my post link
# Turn the restart markers into running counters: the distance from each
# non-NA marker to the next one (or to the end of the data) is the run
# length n, and every run is numbered 1..n.
# lapply() always returns a list (sapply() would return a matrix whenever all
# runs happen to have the same length), and seq_len() makes the 1..n intent
# explicit.
df1$seq <- unlist(lapply(
  diff(c(which(!is.na(df1$seq)), nrow(df1) + 1L)),
  seq_len
))
Apply user's rule 2:
# Rule 2: divide each x9 value by its position in the current sequence run.
df1[["risk_control"]] <- df1[["x9"]] / df1[["seq"]]
# A tibble: 5 x 12
x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 seq risk_control
<dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
1 2 2016.01.04 0~ close 1 8.46 1.59 1.58 1.59 479. 30479. 1 479.
2 4 2016.01.04 0~ close 2 8.6 1.59 1.58 1.59 320. 30799. 2 160.
3 6 2016.01.04 0~ close 3 8.69 1.59 1.58 1.59 504. 31303. 1 504.
4 8 2016.01.04 0~ t/p 4 8.83 1.59 1.58 1.59 609. 31912. 2 305.
5 10 2016.01.04 0~ close 5 9 1.59 1.58 1.59 478. 32390. 3 159.
Recombining this with the original data can be performed if desired with:
# Re-attach seq and risk_control to the full data (rows that were filtered
# out get NA). The join key is spelled out as the ten shared input columns
# instead of relying on the implicit natural join, which emits a message and
# silently changes its key whenever the set of common columns changes.
df2 <- dplyr::left_join(df[, -c(11:13)], df1, by = names(df)[1:10])
# A tibble: 10 x 12
x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 seq risk_control
<dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
1 1 2016.01.04 ~ buy 1 8.46 1.59 1.58 1.59 0 30000 NA NA
2 2 2016.01.04 ~ close 1 8.46 1.59 1.58 1.59 479. 30479. 1 479.
3 3 2016.01.04 ~ buy 2 8.6 1.59 1.58 1.59 0 30479. NA NA
4 4 2016.01.04 ~ close 2 8.6 1.59 1.58 1.59 320. 30799. 2 160.
5 5 2016.01.04 ~ buy 3 8.69 1.59 1.58 1.59 0 30799. NA NA
6 6 2016.01.04 ~ close 3 8.69 1.59 1.58 1.59 504. 31303. 1 504.
7 7 2016.01.04 ~ buy 4 8.83 1.59 1.58 1.59 0 31303. NA NA
8 8 2016.01.04 ~ t/p 4 8.83 1.59 1.58 1.59 609. 31912. 2 305.
9 9 2016.01.04 ~ buy 5 9 1.59 1.58 1.59 0 31912. NA NA
10 10 2016.01.04 ~ close 5 9 1.59 1.58 1.59 478. 32390. 3 159.
I have a dataframe containing n rows and m columns. Each row is an individual and each column is information on that individual.
df
id age income
1 18 12
2 24 24
3 36 12
4 18 24
. . .
. . .
. . .
I also have an r × c matrix showing age buckets in each row and income buckets in each column; each element of the matrix is the % of people in that income-age bucket.
matrix age\income
12 24 36 .....
18 0.15 0.12 0.11 ....
24 0.12 0.6 0.2 ...
36 0.02 0.16 0.16 ...
. ..................
. ..................
For each individual in the dataframe, I need to find the right element of the matrix given the age and income bucket of the individual.
The desired output should look like this
df2
id age income y
1 18 12 0.15
2 24 24 0.6
3 36 12 0.02
4 18 24 0.12
. . .
. . .
. . .
I tried with a series of IFs inside a loop (like in the example):
# Row-by-row lookup: for each individual, pick the matrix row for the age
# bucket and then the column for the income bucket, store it in y, and
# append the row to df2.
# NOTE(review): the example df only shows the columns id/age/income, so
# `df$x` looks like a placeholder — confirm the intended loop bound.
for (i in 1:length(df$x)) {
workingset <- df[i,]
if(workingset$age==18){
# NOTE(review): `marix` is not defined in this snippet — presumably the
# age/income matrix; confirm the name.
temp<-marix[1,]
# NOTE(review): the innermost `ifelse,temp[3]` is not a valid call as
# written; presumably temp[3] is meant as the fallback value.
workingset$y <- ifelse(workingset$income<12, temp[1], ifelse(workingset$income<24,temp[2],ifelse,temp[3])
}else if(workingset$age==24){
temp<-marix[2,]
workingset$y <- ifelse(workingset$income<12, temp[1], ifelse(workingset$income<24,temp[2],ifelse,temp[3])
}else if{
# Placeholder for the remaining age buckets (elided in the question).
...
}
# Start df2 with the first processed row, then append each later row.
if(i==1){
df2 <- workingset
}else{
df2<- rbind(df2, workingset)
}
}
This code works, but it takes too long. Is there a way to do this job efficiently?
Assuming your data looks exactly like shown you could use dplyr and tidyr.
First convert your matrix (I name it my_mat) into a data.frame
# Flatten the age x income matrix into one row per (age, income) cell.
my_mat %>%
  as.data.frame() %>%
  # The age buckets live in the row names; copy them into a regular column.
  mutate(age = rownames(.)) %>%
  pivot_longer(cols = !age, names_to = "income", values_to = "y") %>%
  # age and income originate from the dimnames and are therefore character.
  mutate(across(where(is.character), as.numeric))
returns
# A tibble: 9 x 3
age income y
<dbl> <dbl> <dbl>
1 18 12 0.15
2 18 24 0.12
3 18 36 0.11
4 24 12 0.12
5 24 24 0.6
6 24 36 0.2
7 36 12 0.02
8 36 24 0.16
9 36 36 0.16
This can be left joined with your data.frame df, so in one go:
# Look up each individual's matrix cell: flatten the matrix into a long
# (age, income, y) table and left-join it onto df by age and income.
left_join(
  df,
  my_mat %>%
    as.data.frame() %>%
    mutate(age = rownames(.)) %>%
    pivot_longer(cols = !age, names_to = "income", values_to = "y") %>%
    mutate(across(where(is.character), as.numeric)),
  by = c("age", "income")
)
gives you
# A tibble: 4 x 4
id age income y
<dbl> <dbl> <dbl> <dbl>
1 1 18 12 0.15
2 2 24 24 0.6
3 3 36 12 0.02
4 4 18 24 0.12
Data
my_mat <- structure(c(0.15, 0.12, 0.02, 0.12, 0.6, 0.16, 0.11, 0.2, 0.16
), .Dim = c(3L, 3L), .Dimnames = list(c("18", "24", "36"), c("12",
"24", "36")))
df <- structure(list(id = c(1, 2, 3, 4), age = c(18, 24, 36, 18), income = c(12,
24, 12, 24)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -4L), spec = structure(list(cols = list(
id = structure(list(), class = c("collector_double", "collector"
)), age = structure(list(), class = c("collector_double",
"collector")), income = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))