I have data as follows:
library(data.table)
dat <- structure(list(year2006 = c("1110", "1110", "1110", "1110", "1120",
"1120", "1120", "1120"), group2006 = c("1", "2", "3", "4", "1",
"2", "3", "4"), min2006 = c("1.35", "2", "3.7",
"4.25", "5.6", "4.45", "3.09", "1.13"),
year2007 = c("1110", "1110", "1110", "1110", "1120", "1120",
"1120", "1120"), group2007 = c("1", "2", "3", "4", "1",
"2", "3", "4"), min2007 = c("5", "5.05", "5",
"1.59", "2.3", "3", "4.05", "5.16"
)), row.names = c(NA, -8L), class = c("data.table", "data.frame"
))
dat
year2006 group2006 min2006 year2007 group2007 min2007
1: 1110 1 1.35 1110 1 5
2: 1110 2 2 1110 2 5.05
3: 1110 3 3.7 1110 3 5
4: 1110 4 4.25 1110 4 1.59
5: 1120 1 5.6 1120 1 2.3
6: 1120 2 4.45 1120 2 3
7: 1120 3 3.09 1120 3 4.05
8: 1120 4 1.13 1120 4 5.16
What I would like to do is create a list of the numbers in min200x, per category in year200x.
Desired output:
cat year2006 year2007
1: 1110 c("1.35", "2", "3.7", "4.25") c("5", "5.05", "5", "1.59")
2: 1120 c("5.6", "4.45", "3.09", "1.13") c("2.3", "3", "4.05", "5.16")
I thought I could do something like:
setDT(dat)[, cat := list(min2006), by=year2006]
But that does not work (it just puts the min2006 item in a new column cat). And even if it did, it would only provide a solution for the year 2006. How should I go about this?
I'm not sure why the columns in your test data are all character while the columns in your desired output are numeric. Also, you ask for a list of numbers per group, but your expected output shows a vector.
Nevertheless, here's a tidyverse solution that creates list columns.
library(tidyverse)
x <- dat %>%
  mutate(across(everything(), as.numeric)) %>%
  group_by(year2006) %>%
  select(year2006, starts_with("min")) %>%
  summarise(across(everything(), lst))
x
# A tibble: 2 × 3
year2006 min2006 min2007
<dbl> <named list> <named list>
1 1110 <dbl [4]> <dbl [4]>
2 1120 <dbl [4]> <dbl [4]>
and, for example,
x$min2006
$min2006
[1] 1.35 2.00 3.70 4.25
$min2006
[1] 5.60 4.45 3.09 1.13
If your inputs are actually numeric, you can lose the mutate.
Edit
... and to get the correct name for the grouping column, you can add %>% rename(cat=year2006) to the pipe. Apologies for the omission.
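So, with that correction, the whole pipe reads:
x <- dat %>%
  mutate(across(everything(), as.numeric)) %>%
  group_by(year2006) %>%
  select(year2006, starts_with("min")) %>%
  summarise(across(everything(), lst)) %>%
  rename(cat = year2006)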
A similar approach with data.table:
library(data.table)
COLS <- grep(names(dat), pattern = "^min", value = TRUE)
setDT(dat)[, lapply(.SD, list), .SDcols = COLS, by = year2006]
#> year2006 min2006 min2007
#> 1: 1110 1.35,2,3.7,4.25 5,5.05,5,1.59
#> 2: 1120 5.6,4.45,3.09,1.13 2.3,3,4.05,5.16
Created on 2022-05-31 by the reprex package (v2.0.1)
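Since the min columns in dat are character (see the note above about character vs. numeric output), an optional extra step is to convert them before building the list columns; a sketch, not part of the original answer:
# convert the character "min" columns to numeric first (COLS as defined above)
setDT(dat)[, (COLS) := lapply(.SD, as.numeric), .SDcols = COLS]
dat[, lapply(.SD, list), .SDcols = COLS, by = year2006]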
Here is also a base R solution:
l1 <- lapply(split.default(dat, gsub('\\D+', '', names(dat))), function(i)
  aggregate(as.matrix(i[3]) ~ as.matrix(i[1]), i, list))
do.call(cbind, l1)[-3]
# year2006 2006.min2006 2007.min2007
#1 1110 1.35, 2, 3.7, 4.25 5, 5.05, 5, 1.59
#2 1120 5.6, 4.45, 3.09, 1.13 2.3, 3, 4.05, 5.16
Related
I have a long-form dataframe, with a column (B) including the absolute successive differences between values in column (A), for each individual's ID separately.
ID = c("1", "1", "1", "1", "1", "1", "1", "2", "2", "2", "2", "2", "2")
A = c("120", "115", "125", "119", "128", "129", "130", "140", "142", "143", "145", "144", "148")
B = c("NA", "5", "10", "6", "9", "1", "1", "NA", "2", "1", "2", "1", "4")
DF <- data.frame(ID, A, B)
I would like to create a new column (C) that is the sum of the absolute differences up to and including each value, divided by (the number of measurements used to calculate it minus 1).
This is what I would like the data to look like:
I hope this makes sense, any help greatly appreciated!
Here's a tidyverse solution. You can first group_by the ID, then divide the cumulative sum (cumsum) of B by the row_number minus one. You can only do this after omitting the first row of each group and replacing it with NA.
Note also that in your example, the 'numeric' columns are actually character vectors, so have to be coerced to numeric first.
library(tidyverse)
DF %>%
  mutate(across(A:B, \(x) suppressWarnings(as.numeric(x)))) %>%
  group_by(ID) %>%
  mutate(C = c(NA, cumsum(B[-1]) / (row_number() - 1)[-1]))
#> # A tibble: 13 x 4
#> # Groups: ID [2]
#> ID A B C
#> <chr> <dbl> <dbl> <dbl>
#> 1 1 120 NA NA
#> 2 1 115 5 5
#> 3 1 125 10 7.5
#> 4 1 119 6 7
#> 5 1 128 9 7.5
#> 6 1 129 1 6.2
#> 7 1 130 1 5.33
#> 8 2 140 NA NA
#> 9 2 142 2 2
#> 10 2 143 1 1.5
#> 11 2 145 2 1.67
#> 12 2 144 1 1.5
#> 13 2 148 4 2
Created on 2022-11-11 with reprex v2.0.2
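For reference, the same idea can be written in base R with ave(); a small sketch, assuming the same DF as above:
# coerce the character columns to numeric (the "NA" strings become real NA)
DF[c("A", "B")] <- lapply(DF[c("A", "B")], function(x) suppressWarnings(as.numeric(x)))

# per ID: NA for the first row, then cumsum of B divided by (number of rows used - 1)
DF$C <- ave(DF$B, DF$ID, FUN = function(b) c(NA, cumsum(b[-1]) / seq_along(b[-1])))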
I have panel data (individuals observed in different time periods) like df, with many more individuals in my actual dataset:
id <- c("1", "1", "1", "2", "2", "2")
t <- c("1", "2", "3", "1", "2", "3")
w <- c("0.17", "NA", "NA", "0.23", "NA", "NA")
alpha <- c("0.15", "0.15", "0.15", "0.15", "0.15", "0.15")
rho <- c("0.10", "0.21", "0.32", "0.12", "0.2", "0.08")
df <- data.frame(id, t, w, alpha, rho)
I would like to fill w following these dynamics: w_id_t = w_id_t-1 * alpha + rho_id_t. However, I do not know how to carry each computed value forward to the next row so that I can continue the dynamic calculation. The outcome should look like df_dynamics:
w_new <- c("0.17", "0.2355", "0.345", "0.23", "0.2345", "0.115")
df_dynamics <- data.frame(id, t, w_new, alpha, rho)
Any clue?
The columns are all character class, so we may need to convert them to numeric first and then do the grouped operation.
library(dplyr)
type.convert(df, as.is = TRUE) %>%
  group_by(id) %>%
  mutate(w = coalesce(w, lag(first(w) * first(alpha) + lead(rho)))) %>%
  ungroup
# A tibble: 6 × 5
id t w alpha rho
<int> <int> <dbl> <dbl> <dbl>
1 1 1 0.17 0.15 0.1
2 1 2 0.236 0.15 0.21
3 1 3 0.346 0.15 0.32
4 2 1 0.23 0.15 0.12
5 2 2 0.234 0.15 0.2
6 2 3 0.114 0.15 0.08
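Note that the coalesce()/lag() line above always multiplies the initial w by alpha, which reproduces the expected values here. If the recursion should instead chain through each previously computed value (w_t = w_t-1 * alpha + rho_t for every t), one option is purrr::accumulate(); a sketch, assuming alpha is constant within each id:
library(dplyr)
library(purrr)

type.convert(df, as.is = TRUE) %>%
  group_by(id) %>%
  # start from the first w and repeatedly apply w_prev * alpha + rho
  mutate(w = accumulate(rho[-1], ~ .x * first(alpha) + .y, .init = first(w))) %>%
  ungroup()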
I want to join "Division" from table2 to "Industry" in table1. To do so I will have to match the HSICCD from tablet1 that is between "from" and "to" in table2.
Just to be clear: If HSICCD in table1 is between from/to in table2, I want to take value from table2$division and add it to table1$industry(Or a new column like in a join).
Is there a join function in R that does this without too much hassle? (dplyr solutions are desired, but I'm glad for every contribution!)
Table1:
PERMNO HSICCD Industry
<dbl> <dbl> <lgl>
1 10000 3990 NA
2 10001 4925 NA
3 10002 6020 NA
4 10003 6020 NA
5 10004 5330 NA
6 10005 1310 NA
7 10006 3743 NA
8 10007 7370 NA
9 10008 3430 NA
10 10009 6030 NA
Table2:
from to division
<dbl> <dbl> <chr>
1 100 999 Agriculture
2 1000 1499 Mining
3 1500 1799 Construction
4 1800 1999 Other
5 2000 3999 Manufacturing
6 4000 4999 Transportation
7 5000 5199 Wholesale
8 5200 5999 Retail
9 6000 6799 Finance
10 7000 8999 Services
11 9100 9729 Public
12 9900 9999 Other
My only solution so far is this horrendous code:
Compustat_identifiers$Industry <- NA
for (hsiccd in 1:nrow(Compustat_identifiers)) {
  for (SIC in 1:nrow(sic_table)) {
    if (is.na(Compustat_identifiers$HSICCD[hsiccd]) == T) {
      Compustat_identifiers$Industry[hsiccd] <- "Other"
    } else if (Compustat_identifiers$HSICCD[hsiccd] >= sic_table$from[SIC] &
               Compustat_identifiers$HSICCD[hsiccd] <= sic_table$to[SIC]) {
      Compustat_identifiers$Industry[hsiccd] <- sic_table$division[SIC]
    }
  }
}
With base R you can do this.
For each HSICCD in tbl1, find the row of tbl2 whose from/to range contains it and take the matching tbl2$division, then simply put the result back into tbl1$Industry.
tbl1$Industry <- sapply(tbl1$HSICCD,
                        function(x) tbl2[apply(tbl2[c("from", "to")], 1,
                                               function(y) x >= y[1] & x <= y[2]), "division"])
PERMNO HSICCD Industry
1 10000 3990 Manufacturing
2 10001 4925 Transportation
3 10002 6020 Finance
4 10003 6020 Finance
5 10004 5330 Retail
6 10005 1310 Mining
7 10006 3743 Manufacturing
8 10007 7370 Services
9 10008 3430 Manufacturing
10 10009 6030 Finance
Data
tbl1 <- structure(list(PERMNO = 10000:10009, HSICCD = c(3990L, 4925L,
6020L, 6020L, 5330L, 1310L, 3743L, 7370L, 3430L, 6030L), Industry = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c("1", "2",
"3", "4", "5", "6", "7", "8", "9", "10"), class = "data.frame")
tbl2 <- structure(list(from = c(100L, 1000L, 1500L, 1800L, 2000L, 4000L,
5000L, 5200L, 6000L, 7000L, 9100L, 9900L), to = c(999L, 1499L,
1799L, 1999L, 3999L, 4999L, 5199L, 5999L, 6799L, 8999L, 9729L,
9999L), division = c("Agriculture", "Mining", "Construction",
"Other", "Manufacturing", "Transportation", "Wholesale", "Retail",
"Finance", "Services", "Public", "Other")), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"))
If I understand correctly, we could try two steps:
Use fuzzy_inner_join to join rows where the HSICCD value falls between from and to
left_join by HSICCD and clean up the results
library(dplyr)
library(fuzzyjoin)
result <- fuzzy_inner_join(Table1, Table2,
                           by = c(HSICCD = 'from', HSICCD = 'to'),
                           match_fun = list(`>=`, `<=`))

left_join(Table1, result, by = "HSICCD") %>%
  select(PERMNO = PERMNO.x, HSICCD, Industry = division)
PERMNO HSICCD Industry
1 10000 3990 Manufacturing
2 10001 4925 Transportation
3 10002 6020 Finance
4 10002 6020 Finance
5 10003 6020 Finance
6 10003 6020 Finance
7 10004 5330 Retail
8 10005 1310 Mining
9 10006 3743 Manufacturing
10 10007 7370 Services
11 10008 3430 Manufacturing
12 10009 6030 Finance
You can use ifelse():
for(i in 1:(dim(table2)[1])){
  table1$Industry <- ifelse(table1$HSICCD >= table2$from[i] & table1$HSICCD <= table2$to[i],
                            table2$division[i],
                            table1$Industry)
}
You could pivot_longer tbl2, i.e. join all available id numbers to tbl2's divisions in long format, and use dplyr's fill to complete the data frame:
library(dplyr)
library(tidyr)

tbl2_reformat <- tibble(value = c(100:10000)) %>%
  # first join all available id numbers to tbl2 divisions in long format
  left_join(tbl2 %>%
              pivot_longer(-division) %>%
              select(-name),
            by = "value") %>%
  fill(division)
Afterwards, just join:
result <- tbl1 %>%
  as_tibble() %>%
  select(-Industry) %>%
  left_join(tbl2_reformat, by = c('HSICCD' = 'value')) %>%
  dplyr::rename(Industry = division)
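As a side note: if your dplyr version is 1.1.0 or newer (an assumption about your setup), join_by() can express the range condition directly, without extra packages:
library(dplyr)  # >= 1.1.0 for join_by()

tbl1 %>%
  select(-Industry) %>%
  left_join(tbl2, by = join_by(between(HSICCD, from, to))) %>%
  select(PERMNO, HSICCD, Industry = division)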
*edited with code fragments
structure(tibble(c("top", "jng", "mid", "bot", "sup"), c("369", "Karsa", "knight", "JackeyLove", "yuyanjia"),
c("Malphite", "Rek'Sai", "Zoe", "Aphelios", "Braum"), c("1", "1", "1", "1", "1"), c("7", "5", "7", "5", "0"),
c("6079-7578", "6079-7578", "6079-7578", "6079-7578", "6079-7578")), .Names = c("position", "player", "champion", "result", "kills", "gameid"))
Output:
# A tibble: 5 x 6
position player champion result kills gameid
* <chr> <chr> <chr> <chr> <chr> <chr>
1 top 369 Malphite 1 7 6079-7578
2 jng Karsa Rek'Sai 1 5 6079-7578
3 mid knight Zoe 1 7 6079-7578
4 bot JackeyLove Aphelios 1 5 6079-7578
5 sup yuyanjia Braum 1 0 6079-7578
My desired output would be:
structure(list(gameid = "6079-7578", result = "1", player_top = "369",
player_jng = "Karsa", player_mid = "knight", player_bot = "JackeyLove",
player_sup = "yuyanjia", champion_top = "Malphite", champion_jng = "Rek'Sai",
champion_mid = "Zoe", champion_bot = "Aphelios", champion_sup = "Braum",
kills_top = "7", kills_jng = "5", kills_mid = "7", kills_bot = "5",
kills_sup = "0"), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"))
which looks like this:
gameid result player_top player_jng player_mid player_bot player_sup champion_top champion_jng champion_mid champion_bot champion_sup
1 6079-7578 1 369 Karsa knight JackeyLove yuyanjia Malphite RekSai Zoe Aphelios Braum
kills_top kills_jng kills_mid kills_bot kills_sup
1 7 5 7 5 0
I know I should use pivot_wider() and something like drop_na, but I don't know how to do pivot_wider() with multiple columns and collapse the rows at the same time. Any help would be appreciated.
You can use pivot_wider() for this, defining the "position" variable as the variable that the new column names come from in names_from and the three variables with values you want to use to fill those columns with as values_from.
By default, the multiple values_from variables are pasted onto the front of the new column names. This can be changed, but in this case that matches the naming structure you want.
All other variables in the original dataset will be used as the id_cols in the order that they appear.
library(tidyr)
pivot_wider(dat,
            names_from = "position",
            values_from = c("player", "champion", "kills"))
#> result gameid player_top player_jng player_mid player_bot player_sup
#> 1 1 6079-7578 369 Karsa knight JackeyLove yuyanjia
#> champion_top champion_jng champion_mid champion_bot champion_sup kills_top
#> 1 Malphite Rek'Sai Zoe Aphelios Braum 7
#> kills_jng kills_mid kills_bot kills_sup
#> 1 5 7 5 0
You can control the order of your id columns in the output by explicitly writing them out via id_cols. Here's an example, matching your desired output.
pivot_wider(dat, id_cols = c("gameid", "result"),
            names_from = "position",
            values_from = c("player", "champion", "kills"))
#> gameid result player_top player_jng player_mid player_bot player_sup
#> 1 6079-7578 1 369 Karsa knight JackeyLove yuyanjia
#> champion_top champion_jng champion_mid champion_bot champion_sup kills_top
#> 1 Malphite Rek'Sai Zoe Aphelios Braum 7
#> kills_jng kills_mid kills_bot kills_sup
#> 1 5 7 5 0
Created on 2021-06-24 by the reprex package (v2.0.0)
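(Regarding "This can be changed" above: the naming scheme is controlled by names_glue. For example, if you preferred names like top_player instead of player_top, a quick sketch:)
pivot_wider(dat,
            names_from = "position",
            values_from = c("player", "champion", "kills"),
            names_glue = "{position}_{.value}")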
Using data.table might help here. In dcast() each row will be identified by a unique combo of gameid and result, the columns will be spread by position, and filled with values from the variables listed in value.var.
library(data.table)
library(dplyr)
df <- structure(tibble(c("top", "jng", "mid", "bot", "sup"), c("369", "Karsa", "knight", "JackeyLove", "yuyanjia"),
c("Malphite", "Rek'Sai", "Zoe", "Aphelios", "Braum"), c("1", "1", "1", "1", "1"), c("7", "5", "7", "5", "0"),
c("6079-7578", "6079-7578", "6079-7578", "6079-7578", "6079-7578")), .Names = c("position", "player", "champion", "result", "kills", "gameid"))
df2 <- dcast(setDT(df), gameid + result~position, value.var = list('player','champion','kills'))
I would like to know how to concatenate strings to form sequences of differing lengths and varying content, according to one condition.
Here is a dataframe example (my DF is actually about 60000 rows).
column index: just an index
to_concat: the string item i want to concatenate
max_seq: one example of the condition for concatenation (to_concat should only be concatenated if it is part of the same sequence; I have indicated the position of the string within the sequence for now).
concat_result: The result I would like to have
index to_concat max_seq concat_result
1 Abc! 1 <abc!+def+_>
2 def 2 <abc!+def+_>
3 _ 3 <abc!+def+_>
4 x93 1 <x93+afza+5609+5609+Abc!+def>
5 afza 2 <x93+afza+5609+5609+Abc!+def>
6 5609 3 <x93+afza+5609+5609+Abc!+def>
7 5609 4 <x93+afza+5609+5609+Abc!+def>
8 Abc! 5 <x93+afza+5609+5609+Abc!+def>
9 def 6 <x93+afza+5609+5609+Abc!+def>
10 _ 1 <_+x93+afza>
11 x93 2 <_+x93+afza>
12 afza 3 <_+x93+afza>
I know of paste, aggregate and length, which are probably useful, but I do not see in which order to use them and especially how to formulate the paste.
I suppose I should also include a second index derived from max_seq (such that all strings to be concatenated in the same sequence share the same number, so here we would have three sequences: 1 1 1 2 2 2 2 2 2 3 3 3).
But I do not know if that is the quickest/easiest solution, and I also do not know how to paste vectors of varying length...
Could you please help a fellow PhD? Thanks a lot in advance.
Reproducible example:
> dput(dat)
structure(list(V1 = c("index", "1", "2", "3", "4", "5", "6",
"7", "8", "9", "10", "11", "12"), V2 = c("to_concat", "Abc!",
"def", "_", "x93", "afza", "5609", "5609", "Abc!", "def", "_",
"x93", "afza"), V3 = c("max_seq", "1", "2", "3", "1", "2", "3",
"4", "5", "6", "1", "2", "3"), V4 = c("concat_result", "<abc!+def+_>",
"<abc!+def+_>", "<abc!+def+_>", "<x93+afza+5609+5609+Abc!+def>",
"<x93+afza+5609+5609+Abc!+def>", "<x93+afza+5609+5609+Abc!+def>",
"<x93+afza+5609+5609+Abc!+def>", "<x93+afza+5609+5609+Abc!+def>",
"<x93+afza+5609+5609+Abc!+def>", "<_+x93+afza>", "<_+x93+afza>",
"<_+x93+afza>")), .Names = c("V1", "V2", "V3", "V4"), class = "data.frame", row.names = c(NA,
-13L))
Several options to get the desired result:
1) Using base R:
mydf$grp <- cumsum(mydf$max_seq < c(1,head(mydf$max_seq, -1))) + 1
mydf$concat_result <- ave(mydf$to_concat, mydf$grp,
                          FUN = function(x) paste0('<', paste(x, collapse = '+'), '>'))
which gives:
> mydf
index to_concat max_seq grp concat_result
1 1 Abc! 1 1 <Abc!+def+_>
2 2 def 2 1 <Abc!+def+_>
3 3 _ 3 1 <Abc!+def+_>
4 4 x93 1 2 <x93+afza+5609+5609+Abc!+def>
5 5 afza 2 2 <x93+afza+5609+5609+Abc!+def>
6 6 5609 3 2 <x93+afza+5609+5609+Abc!+def>
7 7 5609 4 2 <x93+afza+5609+5609+Abc!+def>
8 8 Abc! 5 2 <x93+afza+5609+5609+Abc!+def>
9 9 def 6 2 <x93+afza+5609+5609+Abc!+def>
10 10 _ 1 3 <_+x93+afza>
11 11 x93 2 3 <_+x93+afza>
12 12 afza 3 3 <_+x93+afza>
2) Or using the data.table package:
library(data.table)
setDT(mydf)[, grp := cumsum(max_seq < shift(max_seq, fill = 0)) + 1
            ][, concat_result := paste0('<', paste(to_concat, collapse = '+'), '>'), grp][]
3) Or using the dplyr package:
library(dplyr)
mydf %>%
  mutate(grp = cumsum(max_seq < lag(max_seq, n = 1, default = 0)) + 1) %>%
  group_by(grp) %>%
  mutate(concat_result = paste0('<', paste(to_concat, collapse = '+'), '>'))
Used data:
mydf <- structure(list(index = 1:12,
to_concat = c("Abc!", "def", "_", "x93", "afza", "5609", "5609", "Abc!", "def", "_", "x93", "afza"),
max_seq = c(1L, 2L, 3L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L)),
.Names = c("index", "to_concat", "max_seq"), class = "data.frame", row.names = c(NA, -12L))