Add column and row total in ftable - r

I use the ftable to make a table like this:
HPV-16 negative positive
Sex HPV-55
female negative 2341 4
positive 11 0
male negative 2140 23
positive 25 2
Here is the dput code.
structure(c(2341L, 11L, 2140L, 25L, 4L, 0L, 23L, 2L), .Dim = c(4L,
2L), class = "ftable", row.vars = list(Sex = c("female", "male"
), `HPV-55` = c("negative", "positive")), col.vars = list(`HPV-16` = c("negative",
"positive")))
And a sample data of the original data:
structure(list(sex = structure(c(2L, 2L, 1L, 1L, 2L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("female",
"male"), class = c("labelled", "factor"), label = "sex"), orxh16 = structure(c(1L,
1L, 1L, NA, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("negative", "positive"), class = c("labelled",
"factor"), label = "hpv16"), orxh55 = structure(c(1L, 1L, 1L,
NA, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("negative", "positive"), class = c("labelled",
"factor"), label = "hpv55")), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
I tried addmargins: addmargins(tab1, FUN = list(Total=sum), quiet = T), but the detailed information e.g., row names and the layout will be lost.
Total
2341 4 2345
11 0 11
2140 23 2163
25 2 27
Total 4517 29 4546
I'm wondering if there is a way to add the column and row total and meanwhile let the layout of the table looks like before (as below)? Thank you!
HPV-16 negative positive Total
Sex HPV-55
female negative 2341 4 2345
positive 11 0 11
male negative 2140 23 2163
positive 25 2 27
Total 4517 29 4546

addmargins should be used before ftable.
# Build the three-way contingency table, add "Total" margins over the 2nd and
# 3rd dimensions (orxh55 and orxh16), and only then flatten it with ftable().
xtab1 <- xtabs(~ sex + orxh55 + orxh16, data = df)
ftable(addmargins(xtab1, margin = 2:3, FUN = list(Total = sum)))
# Margins computed over dimensions
# in the following order:
# 1: orxh55
# 2: orxh16
# orxh16 negative positive Total
# sex orxh55
# female negative 10 0 10
# positive 0 0 0
# Total 10 0 10
# male negative 9 0 9
# positive 0 0 0
# Total 9 0 9
Sample Data
df <- structure(list(sex = structure(c(2L, 2L, 1L, 1L, 2L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("female",
"male"), class = c("labelled", "factor"), label = "sex"), orxh16 = structure(c(1L,
1L, 1L, NA, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("negative", "positive"), class = c("labelled",
"factor"), label = "hpv16"), orxh55 = structure(c(1L, 1L, 1L,
NA, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("negative", "positive"), class = c("labelled",
"factor"), label = "hpv55")), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))

Just found a way to do this, using summarytools::ctable. Although the tabulation is split by the grouping variable, it provides tables similar to the one from ftable.
tab2 <- with(hpv2, stby(list(x=orxh55, y=orxh16),
sex, ctable, prop="n", useNA="no", dnn = c("HPV_55", "HPV-16")))
Cross-Tabulation
HPV_55 * HPV-16
Data Frame: hpv2
Group: sex = female
---------- -------- ---------- ---------- -------
HPV-16 negative positive Total
HPV_55
negative 2341 4 2345
positive 11 0 11
Total 2352 4 2356
---------- -------- ---------- ---------- -------
Group: sex = male
---------- -------- ---------- ---------- -------
HPV-16 negative positive Total
HPV_55
negative 2140 23 2163
positive 25 2 27
Total 2165 25 2190

Related

How to create in R new column calculating mean (3 previous grouped rows) of numerical variable grouping by variables (factors)?

I have tried using rollapply but I can't get the desired result.
These are the columns(sample) of the dataset on which I want to do the calculations.
structure(list(LeagueROUND = structure(c(1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 4L), .Label = c("1", "2",
"3", "4"), class = "factor"), League = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Portugal2", class = "factor"),
Season = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "2021/2022", class = "factor"),
DRAWmarginODDS = structure(c(2L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
DRAWnumODDS = c(0L, NA, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L,
0L, 0L, NA, 0L)), .Names = c("LeagueROUND", "League", "Season",
"DRAWmarginODDS", "DRAWnumODDS"), class = "data.frame", row.names = c(NA,
-15L))
Desired result
Group by( LeagueROUND,League,Season,DRAWmarginODDS)
Average(mean) of (DRAWnumODDS) of 3 previous LeagueROUNDs
That is:
League Round 1 (Yes) adds 1(DRAWnumODDS) in 3(grouped) rows.
League Round 2 (Yes) adds 2(DRAWnumODDS) in 4(grouped) rows
League Round 3 (Yes) adds 0(DRAWnumODDS) in 3(grouped) rows
Desired:
In League Round 4(Yes) (average of 3 previous League Round) = 3(DRAWnumODDS) in 10(grouped) rows = mean 0,3
League Round (No) = NA
3 first LeagueROUND -> NA
library(tidyverse)
data <- structure(list(
LeagueROUND = structure(c(
1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 4L
), .Label = c(
"1", "2",
"3", "4"
), class = "factor"), League = structure(c(
1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = "Portugal2", class = "factor"),
Season = structure(c(
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L
), .Label = "2021/2022", class = "factor"),
DRAWmarginODDS = structure(c(
2L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L
), .Label = c("No", "Yes"), class = "factor"),
DRAWnumODDS = c(
0L, NA, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L,
0L, 0L, NA, 0L
)
), .Names = c(
"LeagueROUND", "League", "Season",
"DRAWmarginODDS", "DRAWnumODDS"
), class = "data.frame", row.names = c(
NA,
-15L
))
# Per-round summary for the "Yes" rows, joined back onto the original rows.
# Step 1: total DRAWnumODDS per (DRAWmarginODDS, LeagueROUND), NAs ignored.
data %>%
# as.integer() on a factor returns its level codes; the levels here are
# "1".."4", so the codes coincide with the round numbers.
mutate(LeagueROUND = as.integer(LeagueROUND)) %>%
group_by(DRAWmarginODDS, LeagueROUND) %>%
summarise(DRAWnumODDS = sum(DRAWnumODDS, na.rm = TRUE)) %>%
ungroup() %>%
# Step 2: keep only the "Yes" summaries, ordered by round so lag() looks back
# over earlier rounds.
filter(DRAWmarginODDS == "Yes") %>%
arrange(LeagueROUND) %>%
mutate(
# Number of original "Yes" rows in ALL earlier rounds (LeagueROUND < current).
# NOTE(review): this is not limited to the three previous rounds; it only
# coincides with "last 3" for round 4 in this sample.
n_observations = LeagueROUND %>% map_int(~ {
data %>%
mutate(LeagueROUND = as.integer(LeagueROUND)) %>%
filter(LeagueROUND < .x & DRAWmarginODDS == "Yes") %>%
nrow()
}),
# Sum of the three previous rounds' totals divided by the row count above;
# NA whenever fewer than three earlier rounds exist, because lag() yields NA.
mean_last_3_DRAWnumODDS = (lag(DRAWnumODDS, 1) + lag(DRAWnumODDS, 2) + lag(DRAWnumODDS, 3)) / n_observations
) %>%
# Step 3: cast everything to character so both sides of the join have matching
# column types, attach the summary to every original row, then re-guess types.
mutate(across(everything(), as.character)) %>%
# NOTE(review): no `by` is given, so the join uses all shared columns,
# including DRAWnumODDS (see the "Joining, by" message in the output). Rows
# whose own DRAWnumODDS differs from the round total therefore get NA in the
# new columns -- confirm this is intended.
right_join(data %>% mutate(across(everything(), as.character))) %>%
type_convert()
#> `summarise()` has grouped output by 'DRAWmarginODDS'. You can override using the `.groups` argument.Joining, by = c("DRAWmarginODDS", "LeagueROUND", "DRAWnumODDS")
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#> DRAWmarginODDS = col_character(),
#> LeagueROUND = col_double(),
#> DRAWnumODDS = col_double(),
#> n_observations = col_double(),
#> mean_last_3_DRAWnumODDS = col_double(),
#> League = col_character(),
#> Season = col_character()
#> )
#> # A tibble: 15 × 7
#> DRAWmarginODDS LeagueROUND DRAWnumODDS n_observations mean_last_3_DRA… League
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 Yes 1 1 0 NA Portu…
#> 2 Yes 3 0 7 NA Portu…
#> 3 Yes 3 0 7 NA Portu…
#> 4 Yes 3 0 7 NA Portu…
#> 5 Yes 4 0 10 0.3 Portu…
#> 6 Yes 4 0 10 0.3 Portu…
#> 7 Yes 4 0 10 0.3 Portu…
#> 8 Yes 1 0 NA NA Portu…
#> 9 No 1 NA NA NA Portu…
#> 10 Yes 1 0 NA NA Portu…
#> 11 Yes 2 0 NA NA Portu…
#> 12 Yes 2 0 NA NA Portu…
#> 13 Yes 2 1 NA NA Portu…
#> 14 Yes 2 1 NA NA Portu…
#> 15 No 4 NA NA NA Portu…
#> # … with 1 more variable: Season <chr>
Created on 2022-03-15 by the reprex package (v2.0.0)

Why this code is not right statically in ggplot to get percentage in y-axis?

I have this data, and I want to get percentage in y-axis.
structure(list(sb_1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("0", "x"), class = "factor"),
sb_2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "0", class = "factor"), sb_3 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "b", class = "factor"),
sb_4 = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("0", "c"), class = "factor"), wave = structure(c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("h",
"j"), class = "factor")), row.names = c(NA, 12L), class = "data.frame")
This the code I have used:
nn%>%
pivot_longer(cols = starts_with("sb_")) %>%
filter(value != 0) %>%
unite(sb_, name, value) %>%
group_by(wave) %>%
mutate(wave_total = n()) %>%
group_by(sb_, .add = TRUE) %>%
mutate(sb_pct = 100 * n() / wave_total) %>%
ggplot(aes(x = factor(sb_, levels = str_sort(unique(sb_), numeric = TRUE)), y = sb_pct)) +
geom_bar(aes(fill = wave), stat = "identity", position = position_dodge(preserve = "single")) +
xlab("sb") +
ylab("percent")
And the outcome is that :
![1]
And the result should be different because for instance for the first column, there was no zero and all is the outcome.
sb_1 sb_2 sb_3 sb_4 wave
1 0 0 b 0 h
2 0 0 b 0 j
3 0 0 b 0 h
4 0 0 b c j
5 0 0 b c h
6 0 0 b c j
7 x 0 b c h
8 x 0 b c j
9 x 0 b c h
10 x 0 b c j
11 x 0 b c h
12 x 0 b c j
So please help me why is not correct?
I can't tell why your code isn't correct, but I tried a different way and it seems to work as expected:
n <- structure(list(sb_1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("0", "x"), class = "factor"),
sb_2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "0", class = "factor"), sb_3 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "b", class = "factor"),
sb_4 = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("0", "c"), class = "factor"), wave = structure(c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("h",
"j"), class = "factor")), row.names = c(NA, 12L), class = "data.frame")
# Reshape to long format: one row per original row and sb_* column, with the
# column name in `name` and its value in `value`.
n <- pivot_longer(n, cols = starts_with("sb_"))
# Combined key "<wave>_<name>", stored as a factor so that both table() calls
# below are tabulated over the same set of levels.
n$wave_and_name <- as.factor(paste(n$wave,n$name, sep="_"))
# Percentage of non-zero values per key: count of rows with value != 0 divided
# by the count of all rows, times 100. as.data.frame() on the table yields the
# columns Var1 (the key) and Freq (the percentage).
n <- as.data.frame(table(filter(n, value != 0)$wave_and_name) / table(n$wave_and_name) * 100)
# Split the key back apart by fixed character positions.
# NOTE(review): this assumes a one-character wave label and a name of at most
# four characters (true for "h"/"j" and "sb_1".."sb_4" in the sample data).
n$wave <- substr(n$Var1, 1, 1)
n$name <- substr(n$Var1, 3, 6)
# Dodged bar chart of the percentages, one bar per wave within each sb_* name.
ggplot(n, aes(x=name, y=Freq)) +
geom_bar(aes(fill = wave), stat="identity",position = position_dodge()) +
xlab("sb") +
ylab("percent")

Why looping in 1 to unique value returns 1

I have the following data:
Class Identifier Configuration Total_individuals Total_goals Step 1 2 3 4 5
Class Identifier Configuration Total_individuals Total_goals Step Root IFNE IFNE IFEQ IFEQ
Class Identifier Configuration Total_individuals Total_goals Step Root true false true false
JDayChooser d6978cda No_Reduction 1000 208 1 0 11 11 11 11
JDayChooser d6978cda No_Reduction 1000 208 2 0 11 11 11 11
JDayChooser d6978cda No_Reduction 1000 208 3 0 11 11 11 11
JDayChooser d6978cda No_Reduction 1000 208 4 0 11 11 11 11
JDayChooser d6978cda No_Reduction 1000 208 5 0 11 11 11 11
The first two lines give some information that will be used later, but for now I delete them. Then I need to use a loop with the limit of the number of Total_goals:
df <- read.csv("")
df <- df[-c(1:2), ] #to delete the first two lines
total_branches <- unique(df$Total_goals)
for(j in 1:total_branches){
print(j)
}
This gives the following results:
[1] 208
Levels: 208 Total_goals
[1] 1
First of all, why is it still printing the word Total_goals in Levels although I removed the lines that contain this value? Also, why the loop does not work? it only prints 1.
Reproducible data:
structure(list(Class = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 1L
), .Label = c("accessories.plugins.time.JDayChooser", "Class"
), class = "factor"), Identifier = structure(c(2L, 2L, 1L, 1L,
1L, 1L, 1L), .Label = c("d6978cda", "Identifier"), class = "factor"),
Configuration = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("Configuration",
"No_Reduction"), class = "factor"), Total_individuals = structure(c(2L,
2L, 1L, 1L, 1L, 1L, 1L), .Label = c("1000", "Total_individuals"
), class = "factor"), Total_goals = structure(c(2L, 2L, 1L,
1L, 1L, 1L, 1L), .Label = c("208", "Total_goals"), class = "factor"),
Step = structure(c(6L, 6L, 1L, 2L, 3L, 4L, 5L), .Label = c("1",
"2", "3", "4", "5", "Step"), class = "factor"), X1 = structure(c(2L,
2L, 1L, 1L, 1L, 1L, 1L), .Label = c("0", "Root"), class = "factor"),
X2 = structure(c(2L, 3L, 1L, 1L, 1L, 1L, 1L), .Label = c("11",
"IFNE", "true"), class = "factor"), X3 = structure(c(3L,
2L, 1L, 1L, 1L, 1L, 1L), .Label = c("11", "false", "IFNE"
), class = "factor"), X4 = structure(c(2L, 3L, 1L, 1L, 1L,
1L, 1L), .Label = c("11", "IFEQ", "true"), class = "factor"),
X5 = structure(c(3L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("11",
"false", "IFEQ"), class = "factor")), class = "data.frame", row.names = c(NA,
-7L))
The answer to both of your questions is that the columns are of class factor.
When you do :
df <- df[-c(1:2), ]
You remove the rows but the factor levels are still there.
levels(df$Total_goals)
#[1] "208" "Total_goals"
To get rid of that you need to use droplevels.
df <- droplevels(df[-c(1:2), ])
levels(df$Total_goals)
#[1] "208"
Now, even though you have dropped the unused level, Total_goals is still a factor. To convert it to numeric, do
df$Total_goals <- as.numeric(as.character(df$Total_goals))
and then run the for loop
# Loop from 1 up to the (now numeric) number of goals.
# seq_len() is used instead of 1:total_branches so that a count of zero
# produces an empty loop rather than iterating over c(1, 0).
total_branches <- unique(df$Total_goals)
for (j in seq_len(total_branches)) {
  print(j)
}

how to return a column in the query based on the date in another table in R

I try describe in detail.
That's the first table
itogo=structure(list(cfo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "НСК", class = "factor"), code = structure(c(1L,
2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), .Label = c("PlanНСК1", "PlanНСК2",
"PlanНСК3", "PlanНСК4", "PlanНСК5"), class = "factor"), customer_name = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "ТС", class = "factor"),
sales_volume_name = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "от 50 до 100 кг", class = "factor"),
shop_group_name = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "loc", class = "factor"), product_id = c(11628L,
11628L, 11628L, 11628L, 11628L, 11709L, 11709L, 11709L, 11709L,
11709L), shop_code = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = "02293НСК", class = "factor"),
rpost2019.01 = c(3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L),
rpost2019.02 = c(2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L),
rpost2019.03 = c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L),
rpost2019.04 = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
rpost2019.05 = c(1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L),
rpost2019.06 = c(2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L),
regplan.2019.01.meanot.2018.01_amount = c(3794.4, 3794.4,
3794.4, 3794.4, 3794.4, 2514.54, 2514.54, 2514.54, 2514.54,
2514.54), regplan.2019.02.meanot.2018.02_amount = c(2529.6,
2529.6, 2529.6, 2529.6, 2529.6, 1436.88, 1436.88, 1436.88,
1436.88, 1436.88), regplan.2019.03.meanot.2018.03_amount = c(4637.6,
4637.6, 4637.6, 4637.6, 4637.6, 3592.2, 3592.2, 3592.2, 3592.2,
3592.2), regplan.2019.04.meanot.2018.04_amount = c(2529.6,
2529.6, 2529.6, 2529.6, 2529.6, 2873.76, 2873.76, 2873.76,
2873.76, 2873.76), regplan.2019.05.meanot.2018.05_amount = c(1054,
1054, 1054, 1054, 1054, 3412.58982, 3412.58982, 3412.58982,
3412.58982, 3412.58982), regplan.2019.06.meanot.2018.06_amount = c(0,
0, 0, 0, 0, 1077.66, 1077.66, 1077.66, 1077.66, 1077.66),
meanot.2018.03 = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L), .Label = c("5.000000", "5.500000"), class = "factor"),
meanot.2018.04 = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L), .Label = c("6.000000", "8.000000"), class = "factor"),
meanot.2018.05 = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L), .Label = c("5.000000", "6.333333"), class = "factor"),
reg.voz.2019.03.reg.otgruz.план.шт.2019.03 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "0.00", class = "factor"),
reg.voz.2019.04.reg.otgruz.план.шт.2019.04 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "0.00", class = "factor"),
NewTT2019.03 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "0.000000", class = "factor"), NewTT2019.04 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "0.000000", class = "factor"),
NewTT2019.03_amount = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), NewTT2019.04_amount = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L)), .Names = c("cfo", "code", "customer_name",
"sales_volume_name", "shop_group_name", "product_id", "shop_code",
"rpost2019.01", "rpost2019.02", "rpost2019.03", "rpost2019.04",
"rpost2019.05", "rpost2019.06", "regplan.2019.01.meanot.2018.01_amount",
"regplan.2019.02.meanot.2018.02_amount", "regplan.2019.03.meanot.2018.03_amount",
"regplan.2019.04.meanot.2018.04_amount", "regplan.2019.05.meanot.2018.05_amount",
"regplan.2019.06.meanot.2018.06_amount", "meanot.2018.03", "meanot.2018.04",
"meanot.2018.05", "reg.voz.2019.03.reg.otgruz.план.шт.2019.03",
"reg.voz.2019.04.reg.otgruz.план.шт.2019.04", "NewTT2019.03",
"NewTT2019.04", "NewTT2019.03_amount", "NewTT2019.04_amount"), class = "data.frame", row.names = c(NA,
-10L))
-and the second table
anyta=structure(list(data = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L,
3L, 4L, 5L), .Label = c("2019-01-04 00:00:00.000", "2019-02-04 00:00:00.000",
"2019-03-04 00:00:00.000", "2019-04-04 00:00:00.000", "2019-05-04 00:00:00.000"
), class = "factor"), cfo = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = "НСК", class = "factor"), customer_name = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "TC", class = "factor"),
sales_volume_name = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "от 50 до 100 кг", class = "factor"),
shop_group_name = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "loc", class = "factor"), shop_code = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "02293НСК", class = "factor"),
product_id = c(11628L, 11628L, 11628L, 11628L, 11628L, 11709L,
11709L, 11709L, 11709L, 11709L), code = structure(c(1L, 2L,
3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), .Label = c("PlanНСК1", "PlanНСК2",
"PlanНСК3", "PlanНСК4", "PlanНСК5"), class = "factor")), .Names = c("data",
"cfo", "customer_name", "sales_volume_name", "shop_group_name",
"shop_code", "product_id", "code"), class = "data.frame", row.names = c(NA,
-10L))
As we can see, the second table has a data column, while the first table has no data column but does have integer columns
rpost 2019-01, 2019-02....rpost2019-06
01...06 is number of months
01-jan
02-feb
03-mar
04-apr
05-may
06-jun
in second table data in ymd format, i must look only on number of months, here 04, it is april
so, how to do that if in second table in data column number of month=04
then all numeric variables (beginning from rpost) from the first table should start from this month
I.E. output
cfo code customer_name sales_volume_name shop_group_name product_id shop_code rpost2019.04 rpost2019.05 rpost2019.06 regplan.2019.04.meanot.2018.04_amount
1 НСК PlanНСК1 ТС от 50 до 100 кг loc 11628 02293НСК 2 1 2 2529.60
2 НСК PlanНСК2 ТС от 50 до 100 кг loc 11628 02293НСК 2 1 2 2529.60
3 НСК PlanНСК3 ТС от 50 до 100 кг loc 11628 02293НСК 2 1 2 2529.60
4 НСК PlanНСК4 ТС от 50 до 100 кг loc 11628 02293НСК 2 1 2 2529.60
5 НСК PlanНСК5 ТС от 50 до 100 кг loc 11628 02293НСК 2 1 2 2529.60
6 НСК PlanНСК1 ТС от 50 до 100 кг loc 11709 02293НСК 2 3 1 2873.76
7 НСК PlanНСК2 ТС от 50 до 100 кг loc 11709 02293НСК 2 3 1 2873.76
8 НСК PlanНСК3 ТС от 50 до 100 кг loc 11709 02293НСК 2 3 1 2873.76
9 НСК PlanНСК4 ТС от 50 до 100 кг loc 11709 02293НСК 2 3 1 2873.76
10 НСК PlanНСК5 ТС от 50 до 100 кг loc 11709 02293НСК 2 3 1 2873.76
regplan.2019.05.meanot.2018.05_amount regplan.2019.06.meanot.2018.06_amount X.
1 1054.00 0.00 …
2 1054.00 0.00 …
3 1054.00 0.00 …
4 1054.00 0.00 …
5 1054.00 0.00 …
6 3412.59 1077.66 …
7 3412.59 1077.66 …
8 3412.59 1077.66 …
9 3412.59 1077.66 …
10 3412.59 1077.66 …
So in output 01-03 months are absent.
if in table anyta, in data column , 05 is indicated(MAY)
so return columns where 05 month is featured.
In data column of second table can't be different month
But notice, that in table itogo names of column maybe different
regplan 2019-01*meanot 2018-01_amount
regplan 2019-02*meanot 2018-02_amount
regplan 2019-03*meanot 2018-03_amount
regplan 2019-04*meanot 2018-04_amount
regplan 2019-05*meanot 2018-05_amount
regplan 2019-06*meanot 2018-06_amount
meanot 2018-03
meanot 2018-04
meanot 2018-05
reg voz 2019-03/reg otgruz план шт 2019-03
reg voz 2019-04/reg otgruz план шт 2019-04
NewTT2019-03
NewTT2019-04
NewTT2019-03_amount
NewTT2019-04_amount
if we have two data in col.names like
regplan 2019-01*meanot 2018-01_amount
so output from regplan 2019-04***meanot 2018-04**_amount
NewTT2019-03_amount
in this data column output from NewTT2019-04_amount
How to create such condition

Indicator feature creation in R based on multiple columns

I have a dataset with 10 columns, and 3 of those 10 are of interest for creating a new indicator feature. The features are "pT", "pN", & "M" and they all take different values. Of all the values that these 3 features take, there are a total of 9 unique combinations that need to be captured in the new variable.
PATHOT PATHON PATHOM
1 pT2 pN1 M0
4 pT1 pN1 M0
13 pT3 pN1 M0
161 pT1 *pN2 M0
391 pT1 pN1 *M1
810 *pTIS pN1 M0
948 pT3 *pN2 M0
1043 pT2 pN1 *M1
1067 *pT4 pN1 M0
For example, the new variable will have value "1" when PATHOT=pT2, PATHON=pN1 & PATHOM=M0 and so on upto value 9. I have completed the task but after spending almost 20 lines of code involving vectorised operation for all unique combinations.
diag3_bs$sfd[diag3_bs$pathot=="pT2" & diag3_bs$pathon=="pN1" &
diag3_bs$pathom=="M0"] <- 1
diag3_bs$sfd[diag3_bs$pathot=="pT1" & diag3_bs$pathon=="pN1" &
diag3_bs$pathom=="M0"] <- 2
diag3_bs$sfd[diag3_bs$pathot=="pT3" & diag3_bs$pathon=="pN1" &
diag3_bs$pathom=="M0"] <- 3... so on upto 9.
I want to ask if there is a better more automated way of getting the same result?
dput(data.frame) is given below
structure(list(F_STATUS = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "Y", class = "factor"), EVENT_ID = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "BASELINE", class =
"factor"),
PAG_NAME = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = "BR2", class = "factor"), PTSIZE = c(3, 4,
2.7, 2, 0.9, 3, 3, 0.9, 3, 4.5), PTSIZE_U = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "CM", class = "factor"),
PT_SYM = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("", "-", "<", ">"), class = "factor"), PATHOT = structure(c(4L,
4L, 4L, 3L, 3L, 4L, 4L, 3L, 4L, 4L), .Label = c("*pT4", "*pTIS",
"pT1", "pT2", "pT3"), class = "factor"), PATHON = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("*pN2", "pN1"
), class = "factor"), PATHOM = structure(c(2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L), .Label = c("*M1", "M0"), class = "factor"),
RSUBJID = 901000:901009, RUSUBJID = structure(1:10, .Label = c(
"000301-000-901-251", "000301-000-901-252", "000301-000-901-253",
"000301-000-901-254", "000301-000-901-255", "000301-000-901-256",
"000301-000-901-257", "000301-000-901-258", "000301-000-901-259",
"000301-000-901-260", "000301-000-901-261", "000301-000-901-262")
, class = "factor")), .Names = c("F_STATUS", "EVENT_ID", "PAG_NAME", "PTSIZE", "PTSIZE_U", "PT_SYM", "PATHOT",
"PATHON", "PATHOM", "RSUBJID", "RUSUBJID"), row.names = c(NA, 10L),
class = "data.frame")
Thanks.
I tried to edit the data so it didn't throw an error on input. Also created a version of that tabulation of possible combinations:
stg_tbl <- structure(list(PATHOT = structure(c(4L, 3L, 5L, 3L, 3L, 2L, 5L,
4L, 1L), .Label = c("*pT4", "*pTIS", "pT1", "pT2", "pT3"), class = "factor"),
PATHON = structure(c(2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L), .Label = c("*pN2",
"pN1"), class = "factor"), PATHOM = structure(c(2L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L), .Label = c("*M1", "M0"), class = "factor")), .Names = c("PATHOT",
"PATHON", "PATHOM"), class = "data.frame", row.names = c("1",
"4", "13", "161", "391", "810", "948", "1043", "1067"))
Make a vector of text-equivalents of the categories:
stg_lbls <- with(stg_tbl, paste(PATHOT, PATHON, PATHOM, sep="_") )
Then the as.numeric values of a factor created using those levels will be the desired result:
dat$stg <- with(dat, factor( paste(PATHOT, PATHON, PATHOM, sep="_"), levels=stg_lbls))
as.numeric(dat$stg)
#[1] 1 1 1 2 2 1 1 2 1 1
You can just assign those values in the usual way:
dat$sfd <- as.numeric(dat$stg)
I made some new data, that should be useful for your problem.
library(dplyr)

# Every combination of one letter from each of the three sets.
k <- expand.grid(
  data.frame(a = letters[1:3], b = letters[4:6], c = letters[7:9])
)

# Label each row with its pasted combination, then replace that label with the
# integer code of the corresponding factor level.
k2 <- mutate(k, groups = paste0(a, b, c))
k2$groups <- as.numeric(factor(k2$groups))
k2
It's crude, and you're not picking which combination gets which number, so it'd take some digging afterwards, but it's quick.

Resources