Creating a list of column values by group - r

I have data as follows:
dat_in <- structure(list(Name = c("Name_a", "Name_a", "Name_a", "Name_a",
"Name_a", "Name_b", "Name_b", "Name_b", "Name_b"), freq = c(5,
4, 3, 2, 14, 1, 6, 0, 7), colspan = c(bb.25 = 1, bb.100 = 2,
bb.500 = 2, bb.Infinity = 4, bb.SUM = 1, aa.25 = 1, aa.3000 = 7,
aa.Infinity = 1, aa.SUM = 1), width = c(bb.25 = 50, bb.100 = 100,
bb.500 = 100, bb.Infinity = 200, bb.SUM = 50, aa.25 = 50, aa.3000 = 350,
aa.Infinity = 50, aa.SUM = 50)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -9L))
# A tibble: 9 x 4
Name freq colspan width
<chr> <dbl> <dbl> <dbl>
1 Name_a 5 1 50
2 Name_a 4 2 100
3 Name_a 3 2 100
4 Name_a 2 4 200
5 Name_a 14 1 50
6 Name_b 1 1 50
7 Name_b 6 7 350
8 Name_b 0 1 50
9 Name_b 7 1 50
I would like to create a column with list of colspans:
dat_out <- structure(list(rn = c("Name_a", "Name_b"), colspan= list(
c(1, 2, 2, 4, 1), c(1, 1, 7, 1))), row.names = c(NA,
-2L), class = c("tbl_df", "tbl", "data.frame"))
How do I create a list of colspan by name?

By dplyr, you can group data by Name and then summarise each group with list.
library(dplyr)
# One output row per Name: wrapping colspan in list() stores all of a group's
# values together as a single element of a list-column.
summarise(
  group_by(dat_in, Name),
  colspan = list(colspan)
)
# # A tibble: 2 × 2
# Name colspan
# <chr> <list>
# 1 Name_a <dbl [5]>
# 2 Name_b <dbl [4]>

Related

manipulate a pair data in R

I would like to reshape the data sample below to get the output shown in the table. How can I achieve that? The idea is to split the column e into two columns according to disease: values for disease 0 go in one column and values for disease 1 in the other. Thanks in advance.
structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), fid = c(1,
1, 2, 2, 3, 3, 4, 4, 5, 5), disease = c(0, 1, 0, 1, 1, 0, 1, 0, 0,
1), e = c(3, 2, 6, 1, 2, 5, 2, 3, 1, 1)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -10L))
library(tidyverse)
# One row per fid: spread e into one column per disease value (e0, e1),
# then drop the fid key so only the two value columns remain.
# id_cols is passed by name; supplying it by position is deprecated in tidyr.
df %>%
  pivot_wider(
    id_cols = fid,
    names_from = disease,
    values_from = e,
    names_prefix = "e"
  ) %>%
  select(-fid)
e0 e1
<dbl> <dbl>
1 3 2
2 6 1
3 5 2
4 3 2
5 1 1
If you want the columns named e1 and e2 instead, you could do:
# Same reshape, but the column names are built from disease + 1 via a glue
# template, giving e1 and e2 instead of e0 and e1.
# id_cols is passed by name; supplying it by position is deprecated in tidyr.
df %>%
  pivot_wider(
    id_cols = fid,
    names_from = disease,
    values_from = e,
    names_glue = "e{disease + 1}"
  ) %>%
  select(-fid)
# A tibble: 5 x 2
e1 e2
<dbl> <dbl>
1 3 2
2 6 1
3 5 2
4 3 2
5 1 1
We could use lead() combined with ifelse() statements for this:
library(dplyr)
# Assumes the rows are ordered as consecutive pairs. Attach the following
# row's e to each row, keep only the odd-numbered rows (the first of each
# pair), then route the two values into e1/e2 based on that row's disease flag.
df %>%
  mutate(partner_e = lead(e)) %>%
  filter(row_number() %% 2 == 1) %>%
  mutate(
    e1 = ifelse(disease == 1, partner_e, e),
    e2 = ifelse(disease == 0, partner_e, e)
  ) %>%
  select(e1, e2)
e1 e2
<dbl> <dbl>
1 3 2
2 6 1
3 5 2
4 3 2
5 1 1

How to build a string variable to capture multi-column info

I have a df that looks like this:
It can be built using this code:
structure(list(ID = c(1, 2, 3, 4, 5), Pass = c(0, 1, 1, 1, 1),
Math = c(0, 0, 1, 1, 1), ELA = c(0, 1, 0, 1, 0), PE = c(0,
0, 1, 1, 1)), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))
Where Pass indicates whether a student passed any test. Now I want to build a new variable, Result, that captures each student's test results as shown below. What should I do?
Try the base R code below
# df[-(1:2)] drops the first two columns (ID and Pass), leaving the subject
# flags. which(..., arr.ind = TRUE) lists the (row, col) position of every
# flag equal to 1; tapply() then joins the matching subject names per row.
# The factor levels cover every row of df, so rows with no passed subject
# still get an entry (NA). seq_len() is used instead of 1:nrow(df) because
# the latter yields c(1, 0) when df has zero rows.
q <- with(
  data.frame(which(df[-(1:2)] == 1, arr.ind = TRUE)),
  tapply(
    names(df[-(1:2)])[col],
    factor(row, levels = seq_len(nrow(df))),
    toString
  )
)
# NA means no subject was passed for that row.
df$Result <- ifelse(is.na(q), "Not Pass", paste0("Pass: ", q))
which gives
> df
# A tibble: 5 x 6
ID Pass Math ELA PE Result
<dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 1 0 0 0 0 Not Pass
2 2 1 0 1 0 Pass: ELA
3 3 1 1 0 1 Pass: Math, PE
4 4 1 1 1 1 Pass: Math, ELA, PE
5 5 1 1 0 1 Pass: Math, PE
Using dplyr with rowwise
library(dplyr)
library(stringr)
# Work one row at a time: when Pass is non-zero, list the names of the
# Math:PE columns whose value in this row is non-zero; otherwise report
# 'Not pass'. `.` is the piped-in data, used here only for its column names.
df1 %>%
  rowwise() %>%
  mutate(
    Result = if (as.logical(Pass)) {
      str_c(
        "Pass: ",
        toString(names(select(., Math:PE))[as.logical(c_across(Math:PE))])
      )
    } else {
      "Not pass"
    }
  ) %>%
  ungroup()
# A tibble: 5 x 6
# ID Pass Math ELA PE Result
# <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#1 1 0 0 0 0 Not pass
#2 2 1 0 1 0 Pass: ELA
#3 3 1 1 0 1 Pass: Math, PE
#4 4 1 1 1 1 Pass: Math, ELA, PE
#5 5 1 1 0 1 Pass: Math, PE
data
df1 <- structure(list(ID = c(1, 2, 3, 4, 5), Pass = c(0, 1, 1, 1, 1),
Math = c(0, 0, 1, 1, 1), ELA = c(0, 1, 0, 1, 0), PE = c(0,
0, 1, 1, 1)), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))
Here's one solution:
library(dplyr)
library(magrittr)
library(stringr)
library(tidyr) # provides pivot_longer() and pivot_wider(), used below
df <- structure(list(ID = c(1, 2, 3, 4, 5), Pass = c(0, 1, 1, 1, 1),
Math = c(0, 0, 1, 1, 1), ELA = c(0, 1, 0, 1, 0), PE = c(0,
0, 1, 1, 1)), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))
# Long format: one row per ID/subject, with the 0/1 flag stored in `done`.
df %<>% pivot_longer(cols = -c(ID, Pass), names_to = "sub", values_to = "done")
# Per ID, join the subject names; subjects whose flag is not 1 contribute "NA".
df %<>% group_by(ID) %>% mutate(Result = paste0(ifelse(done == 1, sub, NA), collapse = ", ")) %>% ungroup()
# Back to one row per ID, with one column per subject again.
df %<>% pivot_wider(names_from = sub, values_from = done)
# Remove the "NA" placeholders (and their separators) and add the prefix.
df %<>% mutate(Result = paste0("Pass: ", str_replace_all(Result, "NA[, ]*", "")))
# Nothing after the prefix means no subject was passed; otherwise strip a
# trailing comma left behind by a removed final placeholder.
df %<>% mutate(Result = ifelse(str_detect(Result, "Pass: $"), "Not pass", str_replace_all(Result, ",[\\s]*$", "")))
df
# # A tibble: 5 x 6
# ID Pass Result Math ELA PE
# <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
# 1 1 0 Not pass 0 0 0
# 2 2 1 Pass: ELA 0 1 0
# 3 3 1 Pass: Math, PE 1 0 1
# 4 4 1 Pass: Math, ELA, PE 1 1 1
# 5 5 1 Pass: Math, PE 1 0 1
I can provide an explanation of what the code is doing if necessary.

Reshaping data by appending rows from different groups to the same row

I have data as follows:
DT <- structure(list(Area = c("A", "A", "A", "A", "B", "B", "B", "B"
), Year = c(1, 1, 2, 2, 1, 1, 2, 2), Group = c(1, 2, 1, 2, 1,
2, 1, 2), Population_Count = c(10, 12, 10, 12, 10, 13, 10, 11
), Male_Count = c(5, 7, 5, 4, 5, 8, 5, 6), Female_Count = c(5,
5, 5, 8, 5, 5, 5, 5)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
# A tibble: 8 x 6
Area Year Group Population_Count Male_Count Female_Count
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 1 10 5 5
2 A 1 2 12 7 5
3 A 2 1 10 5 5
4 A 2 2 12 4 8
5 B 1 1 10 5 5
6 B 1 2 13 8 5
7 B 2 1 10 5 5
8 B 2 2 11 6 5
I would like to keep one observation per Area-Year, without losing any information. I tried:
DTcast <- dcast(DT, Area + Year ~ Group + Population_Count + Male_Count + Female_Count)
But that results in a lot of rubbish:
Area Year 1_10_5_5 2_11_6_5 2_12_4_8 2_12_7_5 2_13_8_5
1 A 1 5 NA NA 5 NA
2 A 2 5 NA 8 NA NA
3 B 1 5 NA NA NA 5
4 B 2 5 5 NA NA NA
In addition, when I apply it to the actual data, I get:
Using 'H_FEMALE' as value column. Use 'value.var' to override
Error in CJ(1:72284, 1:1333365) :
Cross product of elements provided to CJ() would result in 96380955660 rows which exceeds .Machine$integer.max == 2147483647
So I think I am doing something wrong. I think it maybe has to do with the value.var which I do not know how to select.
Desired result:
# A tibble: 4 x 9
Area Year Group `Population_Count_ Group_1` `Male_Count_ Group_1` `Female_Count_ Group_1` `Population_Count_ Group_2` `Male_Count_ Group_2` `Female_Count_ Group_2`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 1 10 5 5 12 7 5
2 A 2 1 10 5 5 12 4 8
3 B 1 1 10 5 5 13 8 5
4 B 2 1 10 5 5 11 6 5
library(tidyverse)
DT %>% pivot_wider(id_cols = c("Area", "Year"), names_from = "Group", values_from = 4:6)
> DT %>% pivot_wider(id_cols = c("Area", "Year"), names_from = "Group", values_from = 4:6)
# A tibble: 4 x 8
Area Year Population_Count_1 Population_Count_2 Male_Count_1 Male_Count_2 Female_Count_1 Female_Count_2
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 10 12 5 7 5 5
2 A 2 10 12 5 4 5 8
3 B 1 10 13 5 8 5 5
4 B 2 10 11 5 6 5 5
This will name your columns as desired
# Spread columns 4 to 6 into one column per Group value, keyed by Area and
# Year; names_sep is inserted between the value-column name and the Group value.
DT %>%
  pivot_wider(
    id_cols = c(Area, Year),
    names_from = Group,
    values_from = 4:6,
    names_sep = "_Group_"
  )
use data.table
library(data.table)
dt <- structure(list(Area = c("A", "A", "A", "A", "B", "B", "B", "B"
), Year = c(1, 1, 2, 2, 1, 1, 2, 2), Group = c(1, 2, 1, 2, 1,
2, 1, 2), Population_Count = c(10, 12, 10, 12, 10, 13, 10, 11
), Male_Count = c(5, 7, 5, 4, 5, 8, 5, 6), Female_Count = c(5,
5, 5, 8, 5, 5, 5, 5)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
# Convert to a data.table by reference.
setDT(dt)
# One row per Area/Year. Every column whose name contains "_Count" is passed
# as a value column and spread across the Group values.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
dcast(
  dt,
  formula = Area + Year ~ Group,
  value.var = grep("_Count", names(dt), value = TRUE)
)
#> Area Year Population_Count_1 Population_Count_2 Male_Count_1 Male_Count_2
#> 1: A 1 10 12 5 7
#> 2: A 2 10 12 5 4
#> 3: B 1 10 13 5 8
#> 4: B 2 10 11 5 6
#> Female_Count_1 Female_Count_2
#> 1: 5 5
#> 2: 5 8
#> 3: 5 5
#> 4: 5 5
Created on 2020-12-18 by the reprex package (v0.3.0)

Binning by Subgroup in R

I have a dataframe with Markets, Retailers and Sales. I need to bin the Retailers within each Market into 5 quantiles.
Example:
dataframe <- structure(list(Market = c(1, 1, 1, 2, 2, 2), Retailer = c(1,
2, 3, 4, 5, 6), Sales = c(5, 10, 25, 5, 10, 25), Quantile = c(1,
2, 3, 1, 2, 3)), class = "data.frame", row.names = c(NA, -6L))
One approach is using group_by and ntile from dplyr:
library(dplyr)
# Within each Market, rank the rows by Sales and split them into 4 buckets.
# NOTE(review): the question asks for 5 quantiles; pass 5 as the second
# argument to ntile() if quintiles are intended.
dataframe %>%
group_by(Market) %>%
mutate(Quantile = ntile(Sales, 4))
# A tibble: 150 x 4
# Groups: Market [3]
Market Retailer Sales Quantile
<int> <int> <dbl> <int>
1 1 1 16804 1
2 1 2 80752 4
3 1 3 38494 2
4 1 4 32773 2
5 1 5 60210 3
# … with 145 more rows
Data
# Reproducible example data: every retailer id repeated for every market,
# with uniformly distributed sales rounded to whole numbers.
set.seed(3)
dataframe <- local({
  markets <- 1:3
  retailers <- 1:50
  n_rows <- length(markets) * length(retailers)
  data.frame(
    Market = rep(markets, each = length(retailers)),
    Retailer = rep(retailers, times = length(markets)),
    Sales = round(runif(n_rows, min = 0, max = 100000), digits = 0)
  )
})

build a network edge table from a sparse table

I don't know exactly how to explain it but...
I have a sparse table where each group represents a level. The columns are ordered, it means, the downstream (left) column represents a child node and upstream (right) node represents a parent node.
I'd like a two-column table where the 1st column is the parent node and the 2nd is the child node. If possible, also a 3rd column with the length (sum of the number of final nodes) of the parents.
Follow the example:
>tt <- tibble(
ID = letters[1:8],
`1` = c( 1, 1, 1, 1, 2, 2, 2, 2),
`2` = c( 3, 3, 4, 4, 5, 5, 5, 6),
`3` = c( 7, 7, 8, 9,10,10,11,12)
)
> tt
# A tibble: 8 x 4
ID `1` `2` `3`
<chr> <dbl> <dbl> <dbl>
1 a 1 3 7
2 b 1 3 7
3 c 1 4 8
4 d 1 4 9
5 e 2 5 10
6 f 2 5 10
7 g 2 5 11
8 h 2 6 12
>dput(tt)
structure(list(ID = c("a", "b", "c", "d", "e", "f", "g", "h"),
`1` = c(1, 1, 1, 1, 2, 2, 2, 2), `2` = c(3, 3, 4, 4, 5, 5,
5, 6), `3` = c(7, 7, 8, 9, 10, 10, 11, 12)), row.names = c(NA,
-8L), class = c("tbl_df", "tbl", "data.frame"))
the result should be:
>ttt <- tibble(
parent = c(1,1,2,2,3,4,4, 5, 5, 6, 7,7,8,9,10,10,11,12),
child = c(3,4,5,6,7,8,9,10,11,12, letters[1:8] ),
length = c(4,4,4,4,2,2,2, 3, 3, 1, 2,2,1,1, 2, 2, 1, 1)
)
>ttt
# A tibble: 18 x 3
parent child length
<dbl> <chr> <dbl>
1 1 3 4
2 1 4 4
3 2 5 4
4 2 6 4
5 3 7 2
6 4 8 2
7 4 9 2
8 5 10 3
9 5 11 3
10 6 12 1
11 7 a 2
12 7 b 2
13 8 c 1
14 9 d 1
15 10 e 2
16 10 f 2
17 11 g 1
18 12 h 1
> dput(ttt)
structure(list(parent = c(1, 1, 2, 2, 3, 4, 4, 5, 5, 6, 7, 7,
8, 9, 10, 10, 11, 12), child = c("3", "4", "5", "6", "7", "8",
"9", "10", "11", "12", "a", "b", "c", "d", "e", "f", "g", "h"
), length = c(4, 4, 4, 4, 2, 2, 2, 3, 3, 1, 2, 2, 1, 1, 2, 2,
1, 1)), row.names = c(NA, -18L), class = c("tbl_df", "tbl", "data.frame"
))
Any help is appreciated.
Thanks in advance.
This gets you 90% of the way there:
# Move the ID column to the end so that every column is immediately followed
# by the column holding its children.
tt_correct <- tt[, c(2, 3, 4, 1)]

# For each adjacent pair of columns, build the parent/child edges, count how
# many rows share each parent, and drop duplicated edges.
level_edges <- lapply(
  seq_len(ncol(tt) - 1),
  function(level) {
    edges <- setNames(
      tt_correct[, c(level, level + 1)],
      c("parent", "child")
    )
    edges$length <- ave(edges$parent, edges$parent, FUN = length)
    unique(edges)
  }
)

# Stack the per-level edge tables into one table.
ttt <- do.call(rbind, level_edges)
ttt
# A tibble: 18 x 3
parent child length
<dbl> <chr> <dbl>
1 1 3 4
2 1 4 4
3 2 5 4
4 2 6 4
5 3 7 2
6 4 8 2
7 4 9 2
8 5 10 3
9 5 11 3
10 6 12 1
11 7 a 2
12 7 b 2
13 8 c 1
14 9 d 1
15 10 e 2
16 10 f 2
17 11 g 1
18 12 h 1
The first part is correcting the order. Your expected output indicates that the 1st column is a child of the 4th column. The lapply() statement largely walks along the data.frame and stacks the data.
This is 90% of the way because the answer doesn't agree with your expected output for lengths. I think this is correct but I could be wrong.
Finally, and I'm not that good with igraph, you could likely find additional information doing:
library(igraph)
# Build a graph from the first two columns of ttt (parent, child) and plot it.
plot(graph_from_data_frame(ttt[, 1:2]))

Resources