Is there an efficient alternative to table() in R?

I use the following command:
table(factor("list",levels=1:"n")
with "list": (example) a = c(1,3,4,4,3)
and levels = 1:5, to also take the 2 and 5 into consideration.
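Written out with the example data, this looks like:
a <- c(1, 3, 4, 4, 3)
table(factor(a, levels = 1:5))
#> 1 2 3 4 5
#> 1 0 2 2 0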
For really big datasets, this approach seems to be very inefficient.
Does anyone know a hidden library or a code snippet to make it faster?

We could use fnobs from collapse, which is efficient:
library(collapse)
fnobs(df, g = df$X1)
In base R, tabulate is more efficient than table:
tabulate(df$X1)
[1] 9 6 15 13 11 9 7 9 11 10
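One caveat: tabulate() only counts up to the largest observed value, so trailing absent levels are dropped. To mirror the OP's levels = 1:5 on the small example vector, the nbins argument can be used (a minimal sketch):
a <- c(1, 3, 4, 4, 3)
tabulate(a)             # length 4: the empty level 5 is dropped
#> [1] 1 0 2 2
tabulate(a, nbins = 5)  # keeps the zero counts for 2 and 5
#> [1] 1 0 2 2 0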

We could also use janitor::tabyl:
library(janitor)
df %>%
  tabyl(X1) %>%
  adorn_totals()
X1 n percent
1 9 0.09
2 6 0.06
3 15 0.15
4 13 0.13
5 11 0.11
6 9 0.09
7 7 0.07
8 9 0.09
9 11 0.11
10 10 0.10
Total 100 1.00

It's not exactly what you are looking for, but perhaps you can use this:
library(dplyr)
set.seed(8192)
df <- data.frame(X1 = sample(1:10, 100, replace = TRUE))
df %>%
  count(X1)
returns
X1 n
1 1 9
2 2 6
3 3 15
4 4 13
5 5 11
6 6 9
7 7 7
8 8 9
9 9 11
10 10 10
If you need to count more numbers (including missing ones), you could use
library(tidyr)
library(dplyr)
df2 <- data.frame(X1 = 1:12)
df %>%
  count(X1) %>%
  right_join(df2, by = "X1") %>%
  mutate(n = replace_na(n, 0L))
to get
X1 n
1 1 9
2 2 6
3 3 15
4 4 13
5 5 11
6 6 9
7 7 7
8 8 9
9 9 11
10 10 10
11 11 0
12 12 0

TL;DR: the winner is base::tabulate.
Summing up, the main objective was performance, so I prepared a microbenchmark of all the provided solutions. I used a small and a bigger vector, two different scenarios. For the collapse package I had to install the newest Rcpp package (1.0.7) on my machine to suppress crashes. Even the Rcpp solution I added is slower than base::tabulate.
suppressMessages(library(janitor))
suppressMessages(library(collapse))
suppressMessages(library(dplyr))
suppressMessages(library(cpp11))
# source https://stackoverflow.com/questions/31001392/rcpp-version-of-tabulate-is-slower-where-is-this-from-how-to-understand
Rcpp::cppFunction('IntegerVector tabulate_rcpp(const IntegerVector& x, const unsigned max) {
  IntegerVector counts(max);
  for (auto& now : x) {
    if (now > 0 && now <= max)
      counts[now - 1]++;
  }
  return counts;
}')
set.seed(1234)
a = c(1,3,4,4,3)
levels = 1:5
df <- data.frame(X1 = a)
microbenchmark::microbenchmark(
  tabulate_rcpp = {tabulate_rcpp(df$X1, max(df$X1))},
  base_table = {base::table(factor(df$X1, 1:max(df$X1)))},
  stats_aggregate = {stats::aggregate(. ~ X1, cbind(df, n = 1), sum)},
  graphics_hist = {hist(df$X1, plot = FALSE, right = FALSE)[c("breaks", "counts")]},
  janitor_tably = {adorn_totals(tabyl(df, X1))},
  collapse_fnobs = {fnobs(df, df$X1)},
  base_tabulate = {tabulate(df$X1)},
  dplyr_count = {count(df, X1)})
#> Unit: microseconds
#>            expr      min        lq       mean    median        uq       max neval
#>   tabulate_rcpp    2.959    5.9800   17.42326    7.9465    9.5435   883.561   100
#>      base_table   48.524   59.5490   72.42985   66.3135   78.9320   153.216   100
#> stats_aggregate  829.324  891.7340 1069.86510  937.4070 1140.0345  2883.025   100
#>   graphics_hist  148.561  170.5305  221.05290  188.9570  228.3160   958.619   100
#>   janitor_tably 6005.490 6439.6870 8137.82606 7497.1985 8283.3670 53352.680   100
#>  collapse_fnobs   14.591   21.9790   32.63891   27.2530   32.6465   417.987   100
#>   base_tabulate    1.879    4.3310    5.68916    5.5990    6.6210    16.789   100
#>     dplyr_count 1832.648 1969.8005 2546.17131 2350.0450 2560.3585  7210.992   100
df <- data.frame(X1 = sample(1:5, 1000, replace = TRUE))
microbenchmark::microbenchmark(
  tabulate_rcpp = {tabulate_rcpp(df$X1, max(df$X1))},
  base_table = {base::table(factor(df$X1, 1:max(df$X1)))},
  stats_aggregate = {stats::aggregate(. ~ X1, cbind(df, n = 1), sum)},
  graphics_hist = {hist(df$X1, plot = FALSE, right = FALSE)[c("breaks", "counts")]},
  janitor_tably = {adorn_totals(tabyl(df, X1))},
  collapse_fnobs = {fnobs(df, df$X1)},
  base_tabulate = {tabulate(df$X1)},
  dplyr_count = {count(df, X1)})
#> Unit: microseconds
#>            expr      min        lq       mean    median        uq       max neval
#>   tabulate_rcpp    4.847    8.8465   10.92661   10.3105   12.6785    28.407   100
#>      base_table   83.736  107.2040  121.77962  118.8450  129.9560   184.427   100
#> stats_aggregate 1027.918 1155.9205 1338.27752 1246.6205 1434.8990  2085.821   100
#>   graphics_hist  209.273  237.8265  274.60654  258.9260  300.3830   523.803   100
#>   janitor_tably 5988.085 6497.9675 7833.34321 7593.3445 8422.6950 13759.142   100
#>  collapse_fnobs   26.085   38.6440   51.89459   47.8250   57.3440   333.034   100
#>   base_tabulate    4.501    6.7360    8.09408    8.2330    9.2170    11.463   100
#>     dplyr_count 1852.290 2000.5225 2374.28205 2145.9835 2516.7940  4834.544   100
Created on 2021-08-01 by the reprex package (v2.0.0)

A base R option using aggregate (borrowing df from @Martin Gal):
> aggregate(. ~ X1, cbind(df, n = 1), sum)
X1 n
1 1 9
2 2 6
3 3 15
4 4 13
5 5 11
6 6 9
7 7 7
8 8 9
9 9 11
10 10 10
Another option is using hist
> hist(df$X1, plot = FALSE, right = FALSE)[c("breaks", "counts")]
$breaks
[1] 1 2 3 4 5 6 7 8 9 10
$counts
[1]  9  6 15 13 11  9  7  9 21
Note that with these breaks the last interval is closed on both ends, so the counts for 9 and 10 are merged into the final 21.

Here is one more option: summarytools.
Data from Martin Gal, many thanks:
library(summarytools)
set.seed(8192)
df <- data.frame(X1 = sample(1:10, 100, replace = TRUE))
summarytools::freq(df$X1, cumul=FALSE)
Output:
Freq % Valid % Total
----------- ------ --------- ---------
1 9 9.00 9.00
2 6 6.00 6.00
3 15 15.00 15.00
4 13 13.00 13.00
5 11 11.00 11.00
6 9 9.00 9.00
7 7 7.00 7.00
8 9 9.00 9.00
9 11 11.00 11.00
10 10 10.00 10.00
<NA> 0 0.00
Total 100 100.00 100.00

If a faster alternative to table() is required, including cross-tabulation, collapse::qtab(), available since v1.8.0 (May 2022) is a faithful and noticeably faster alternative. fcount() can also be used in the univariate case, and returns a data.frame.
library(collapse) # > v1.8.0, and > 1.9.0 for fcount()
library(microbenchmark)
v = sample(10000, 1e6, TRUE)
microbenchmark(qtab(v, sort = FALSE), fcount(v), tabulate(v), times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
qtab(v, sort = FALSE) 1.911707 1.945245 2.002473 1.963654 2.027942 2.207891 10
fcount(v) 1.885549 1.906746 1.978894 1.932310 2.103997 2.138027 10
tabulate(v) 2.321543 2.323716 2.333839 2.328206 2.334499 2.372506 10
v2 = sample(10000, 1e6, TRUE)
microbenchmark(qtab(v, v2), qtab(v, v2, sort = FALSE), table(v, v2), times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
qtab(v, v2) 45.61279 51.14840 74.16168 60.7761 72.86385 157.6501 10
qtab(v, v2, sort = FALSE) 41.30812 49.66355 57.02565 51.3568 54.69859 118.1289 10
table(v, v2) 281.60079 282.85273 292.48119 286.0535 288.19253 349.5513 10
That being said, tabulate() is pretty much as fast as it gets as far as C code is concerned. But it has a clear caveat, which is that it does not hash the values at all, but determines the maximum value and allocates a results vector of that length, using it as a table to count values. Consider this:
v[10] = 1e7L # Adding a random large value here
length(tabulate(v))
[1] 10000000
length(table(v))
[1] 10001
length(qtab(v))
[1] 10001
So you get a result vector with roughly 9.99 million zeros, and your performance deteriorates:
microbenchmark(qtab(v, sort = FALSE), fcount(v), tabulate(v), times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
qtab(v, sort = FALSE) 1.873249 1.900473 1.966721 1.923064 2.064186 2.126588 10
fcount(v) 1.829338 1.850330 1.926676 1.880199 2.021013 2.057667 10
tabulate(v) 4.207789 4.357439 5.066296 4.417012 4.558216 10.347744 10
In light of this, the fact that qtab() actually does hash every value and achieves this performance is rather remarkable.

Possible to avoid a FOR loop in this very simple R code?

The answers below are very helpful, but I oversimplified my original question. I figured I'd learn more if I oversimplified and then adapted to my actual need, but now I am stuck. There are other factors that drive the amortization; see the more complete code below. I like the response using "amort$end_bal <- begin_bal * (1 - mpr)^amort$period" and "amort$pmt <- c(0, diff(amort$end_bal))* -1", but in addition npr increases the ending balances and ch_off decreases them. Here's the more complete code:
n_periods <- 8
begin_bal <- 10000
yld <- .20
npr <- .09
mpr <- .10
co <- .10
period <- seq(0, n_periods, 1)
fin <- 0
pur <- 0
pmt <- 0
ch_off <- 0
end_bal <- begin_bal
for (i in 1:n_periods) {
  fin[i+1] <- end_bal[i] * yld / 12
  pur[i+1] <- end_bal[i] * npr
  pmt[i+1] <- end_bal[i] * mpr
  ch_off[i+1] <- end_bal[i] * co / 12
  end_bal[i+1] <- end_bal[i] + pur[i+1] - pmt[i+1] - ch_off[i+1]
}
amort <- data.frame(period, fin, pur, pmt, ch_off, end_bal)
Which gives the below correct output:
print(amort,row.names=FALSE)
period fin pur pmt ch_off end_bal
0 0.0000 0.0000 0.0000 0.00000 10000.000
1 166.6667 900.0000 1000.0000 83.33333 9816.667
2 163.6111 883.5000 981.6667 81.80556 9636.694
3 160.6116 867.3025 963.6694 80.30579 9460.022
4 157.6670 851.4020 946.0022 78.83351 9286.588
5 154.7765 835.7929 928.6588 77.38823 9116.334
6 151.9389 820.4700 911.6334 75.96945 8949.201
7 149.1534 805.4281 894.9201 74.57668 8785.132
8 146.4189 790.6619 878.5132 73.20944 8624.072
I'm new to R, and I understand one of its features is matrix/vector manipulation. In the example below I amortize an asset over 8 months, where each payment ("pmt") is 10% ("mpr") of the prior period balance ("end_bal"). The code works fine; I used a FOR loop. I understand FOR loops can be slow in large models and that a better solution is to use R's abundant vector/matrix functions, but I didn't know how to do this in my example, since each monthly payment is calculated by referencing the prior period's ending balance.
So my questions are:
Is there a more efficient way to do the below?
How do I replace the 0 for pmt in period 0, with an empty space?
R code:
n_periods <- 8
begin_bal <- 100
mpr <- .10
# Example loan amortization
pmt = 0
end_bal = begin_bal
for (i in 1:n_periods) {
  pmt[i+1] <- end_bal[i] * mpr
  end_bal[i+1] <- end_bal[i] - pmt[i+1]
}
amort <- data.frame(period = 0:n_periods,pmt,end_bal)
amort
Results, which are correct:
> amort
period pmt end_bal
1 0 0.000000 100.00000
2 1 10.000000 90.00000
3 2 9.000000 81.00000
4 3 8.100000 72.90000
5 4 7.290000 65.61000
6 5 6.561000 59.04900
7 6 5.904900 53.14410
8 7 5.314410 47.82969
9 8 4.782969 43.04672
Use R's vectorised calculations
n_periods <- 8
begin_bal <- 100
mpr <- .10
amort <- data.frame(period = seq(0, n_periods, 1))
amort$end_bal <- begin_bal * (1 - mpr)^amort$period
amort$pmt <- c(0, diff(amort$end_bal))* -1
amort
#> period end_bal pmt
#> 1 0 100.00000 0.000000
#> 2 1 90.00000 10.000000
#> 3 2 81.00000 9.000000
#> 4 3 72.90000 8.100000
#> 5 4 65.61000 7.290000
#> 6 5 59.04900 6.561000
#> 7 6 53.14410 5.904900
#> 8 7 47.82969 5.314410
#> 9 8 43.04672 4.782969
Created on 2021-05-12 by the reprex package (v2.0.0)
A dplyr way, for a different case (say):
n_periods <- 15
begin_bal <- 1000
mpr <- .07
library(dplyr)
seq(0, n_periods, 1) %>%
  as.data.frame() %>%
  setNames('period') %>%
  mutate(end_bal = begin_bal * (1 - mpr)^period,
         pmt = -1 * c(0, diff(end_bal)))
#> period end_bal pmt
#> 1 0 1000.0000 0.00000
#> 2 1 930.0000 70.00000
#> 3 2 864.9000 65.10000
#> 4 3 804.3570 60.54300
#> 5 4 748.0520 56.30499
#> 6 5 695.6884 52.36364
#> 7 6 646.9902 48.69819
#> 8 7 601.7009 45.28931
#> 9 8 559.5818 42.11906
#> 10 9 520.4111 39.17073
#> 11 10 483.9823 36.42878
#> 12 11 450.1035 33.87876
#> 13 12 418.5963 31.50725
#> 14 13 389.2946 29.30174
#> 15 14 362.0439 27.25062
#> 16 15 336.7009 25.34308
Created on 2021-05-12 by the reprex package (v2.0.0)
Since the OP has added another scenario in an edit, here's the suggested approach for that case (for future reference). The recursion is still geometric: each period the balance is multiplied by 1 + npr - mpr - co/12, so the same closed-form idea applies.
n_periods <- 8
begin_bal <- 10000
yld <- .20
npr <- .09
mpr <- .10
co <- .10
library(dplyr)
seq(0, n_periods, 1) %>%
  as.data.frame() %>%
  setNames('period') %>%
  mutate(end_bal = begin_bal * (1 - (mpr + co/12 - npr))^period,
         fin = c(0, (end_bal * yld/12)[-nrow(.)]),
         pur = c(0, (end_bal * npr)[-nrow(.)]),
         pmt = c(0, (end_bal * mpr)[-nrow(.)]),
         ch_off = c(0, (end_bal * co/12)[-nrow(.)]))
#> period end_bal fin pur pmt ch_off
#> 1 0 10000.000 0.0000 0.0000 0.0000 0.00000
#> 2 1 9816.667 166.6667 900.0000 1000.0000 83.33333
#> 3 2 9636.694 163.6111 883.5000 981.6667 81.80556
#> 4 3 9460.022 160.6116 867.3025 963.6694 80.30579
#> 5 4 9286.588 157.6670 851.4020 946.0022 78.83351
#> 6 5 9116.334 154.7765 835.7929 928.6588 77.38823
#> 7 6 8949.201 151.9389 820.4700 911.6334 75.96945
#> 8 7 8785.132 149.1534 805.4281 894.9201 74.57668
#> 9 8 8624.072 146.4189 790.6619 878.5132 73.20944
Created on 2021-05-13 by the reprex package (v2.0.0)
If you are "lazy" (don't want to formulate the general expression of pmt and end_bal), you can define a recursive function f like blow
# assumes mpr <- .10 as defined in the question
f <- function(k) {
  if (k == 1) {
    return(data.frame(pmt = 100 * mpr, end_bal = 100))
  }
  u <- f(k - 1)
  end_bal <- with(tail(u, 1), end_bal - pmt)
  pmt <- mpr * end_bal
  rbind(u, data.frame(pmt, end_bal))
}
n_periods <- 8
res <- transform(
  cbind(period = 0:n_periods, f(n_periods + 1)),
  pmt = c(0, head(pmt, -1))
)
and you will see
> res
period pmt end_bal
1 0 0.000000 100.00000
2 1 10.000000 90.00000
3 2 9.000000 81.00000
4 3 8.100000 72.90000
5 4 7.290000 65.61000
6 5 6.561000 59.04900
7 6 5.904900 53.14410
8 7 5.314410 47.82969
9 8 4.782969 43.04672

How to reduce the if statement for multiple arguments?

I would like to reduce this code:
carro$custo_tprivate = with(carro, ifelse(decile_renda == 1,
renda_fa*0.116,
ifelse(decile_renda == 2, renda_fa*0.106,
ifelse(decile_renda == 3, renda_fa*0.102,
ifelse(decile_renda == 4, renda_fa*0.115,
ifelse(decile_renda == 5, renda_fa*0.124,
ifelse(decile_renda == 6, renda_fa*0.125,
ifelse(decile_renda == 7, renda_fa*0.137,
ifelse(decile_renda == 8, renda_fa*0.141,
ifelse(decile_renda == 9, renda_fa*0.156,
ifelse(decile_renda == 10, renda_fa*0.131, 0)))))))))))
Could someone teach me how to do that?
Thank you very much!
You can use match() to reduce your nested ifelse() statements, as below.
# put your case data into a data frame
data <- data.frame(x = seq(1, 10, by = 1),
                   y = runif(10))
# create your actual data
carro <- data.frame(decile_renda = sample(1:10, 10, replace = TRUE),
                    renda_fa = runif(10))
# match to get the positions in the case table
pos <- with(carro, match(decile_renda, data$x, 0))
# multiply to get the results
data$y[pos] * carro$renda_fa
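For the OP's actual columns, the same idea could look like this (a sketch; fa holds the multipliers from the question, and deciles outside 1:10 fall back to 0):
# assumes carro has columns decile_renda and renda_fa
fa <- c(0.116, 0.106, 0.102, 0.115, 0.124, 0.125, 0.137, 0.141, 0.156, 0.131)
pos <- match(carro$decile_renda, 1:10)   # NA where there is no match
carro$custo_tprivate <- ifelse(is.na(pos), 0, carro$renda_fa * fa[pos])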
Alternatively, this can be solved by left-joining with a lookup table.
If I understand correctly the nested ifelse() construct, the OP wants to multiply renda with a factor which depends on the value of decile_renda. The factors are given for 10 distinct values of decile_renda. In all other cases, the result must be zero.
Using left join will find matching values of decile_renda in the lookup table. Non-matching rows in carro will get an NA value. These need to be replaced by zero, subsequently.
The lookup table treats the individual cases as data instead of hard-coding them. This gives the flexibility to add or change cases without changing the code.
Create lookup table and test dataset
# create lookup table
lut <- data.frame(
decile_renda = 1:10,
fa = c(0.116, 0.106, 0.102, 0.115, 0.124, 0.125, 0.137, 0.141, 0.156, 0.131)
)
lut
decile_renda fa
1 1 0.116
2 2 0.106
3 3 0.102
4 4 0.115
5 5 0.124
6 6 0.125
7 7 0.137
8 8 0.141
9 9 0.156
10 10 0.131
# create test dataset
carro <- data.frame(decile_renda = 0:11, renda_fa = 100)
# randomize row order
set.seed(1L) # required for reproducible data
carro <- carro[sample(nrow(carro)), ]
carro
decile_renda renda_fa
9 8 100
4 3 100
7 6 100
1 0 100
2 1 100
5 4 100
3 2 100
8 7 100
6 5 100
11 10 100
12 11 100
10 9 100
Note that the test dataset has been chosen to allow for easy verification of the results.
Base R: merge()
carro <- merge(carro, lut, all.x = TRUE, by = "decile_renda")
carro$custo_tprivate <- with(carro, ifelse(is.na(fa), 0, renda_fa * fa))
carro
decile_renda renda_fa fa custo_tprivate
1 0 100 NA 0.0
2 1 100 0.116 11.6
3 2 100 0.106 10.6
4 3 100 0.102 10.2
5 4 100 0.115 11.5
6 5 100 0.124 12.4
7 6 100 0.125 12.5
8 7 100 0.137 13.7
9 8 100 0.141 14.1
10 9 100 0.156 15.6
11 10 100 0.131 13.1
12 11 100 NA 0.0
Note that the result column custo_tprivate shows 0.0 for the rows with non-matching decile_renda values of 0 and 11 as requested.
However, the drawback here is that merge() does not maintain the original row order (this is why the test dataset uses a random row order for demonstration). Also, the result contains the fa column which is no longer needed.
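One possible workaround (a sketch, assuming carro and lut as originally created above, before the merge): keep a row index so the original order can be restored afterwards, then drop the helper columns.
carro$rid <- seq_len(nrow(carro))                        # remember the original order
res <- merge(carro, lut, all.x = TRUE, by = "decile_renda")
res <- res[order(res$rid), ]                             # restore the original row order
res$custo_tprivate <- with(res, ifelse(is.na(fa), 0, renda_fa * fa))
res[, c("decile_renda", "renda_fa", "custo_tprivate")]   # drop rid and fa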
dplyr
library(dplyr)
carro %>%
  left_join(lut, by = "decile_renda") %>%
  mutate(custo_tprivate = if_else(is.na(fa), 0, renda_fa * fa)) %>%
  select(-fa)
decile_renda renda_fa custo_tprivate
1 8 100 14.1
2 3 100 10.2
3 6 100 12.5
4 0 100 0.0
5 1 100 11.6
6 4 100 11.5
7 2 100 10.6
8 7 100 13.7
9 5 100 12.4
10 10 100 13.1
11 11 100 0.0
12 9 100 15.6
Here, the original row order is kept and the fa column has been removed.
data.table
With data.table we can do an update join where the matching rows of carro are being updated by reference, i.e., without copying the whole object. Only the result column custo_tprivate is appended to carro but not fa which would have to be removed afterwards. Also, for replacing the NA values only the affected rows are updated in place.
This might be an advantage in terms of speed and memory consumption in case of large datasets.
library(data.table)
setDT(carro)[lut, on = .(decile_renda), custo_tprivate := renda_fa * fa]
carro[is.na(custo_tprivate), custo_tprivate := 0]
carro
decile_renda renda_fa custo_tprivate
1: 8 100 14.1
2: 3 100 10.2
3: 6 100 12.5
4: 0 100 0.0
5: 1 100 11.6
6: 4 100 11.5
7: 2 100 10.6
8: 7 100 13.7
9: 5 100 12.4
10: 10 100 13.1
11: 11 100 0.0
12: 9 100 15.6

All values between min and max

Suppose I know the min and max id, and what I need is all the ids between the min and max. Suppose id <- c(1:20), so min = 1 and max = 20; which function in R shows all the values between these two numbers?
You can use set algebra:
id <- c(1:20)
setdiff(id, range(id))
#[1] 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
Also you can do:
id[!(id %in% range(id))]
or:
id[!(id %in% c(max(id), min(id)))]
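If the goal is actually the full inclusive sequence from the minimum to the maximum (rather than only the values strictly between them), base R's seq() or the colon operator covers that as well:
id <- c(1:20)
seq(min(id), max(id))   # or equivalently min(id):max(id)
#>  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20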

Normalize/scale data set

I have the following data set:
dat<-as.data.frame(rbind(10,8,2,7,10,10,1,10,14,9,2,6,10,8,10,8,10,10,7,11,10))
colnames(dat)<-"Score"
print(dat)
Score
10
8
2
7
10
10
1
10
14
9
2
6
10
8
10
8
10
10
7
11
10
These are the test scores which the students obtained. A student could get a maximum of 15 or a minimum of 0 on this test (by the way, nobody got the max or the min); however, the lowest score actually obtained was 1 and the highest was 14.
Now, I want to normalize/scale this data to a scale of 0 to 20.
How can I achieve this in Excel or in R?
My final goal is to normalize the scores in this test to the above scale and to compare them with another set of data for which the max and min is 5 and 0 respectively.
How to compare these two different scaled data sets correctly against each other?
What I tried:
I went through a lot of material on the internet and came up with this formula, which I got from Wikipedia (min-max feature scaling): x' = (x - min(x)) / (max(x) - min(x)).
Is this method reliable?
In your case I would use the feature scaling formula you posted in your question. The (x - min(x)) / (max(x) - min(x)) part will essentially convert your test marks to the range between 0 and 1.
Since the edges of the scale are indeed 0 and 15 (and not the observed 1 and 14), your min(x) = 0 and your max(x) = 15. Once you have your marks between 0 and 1 using the above, you just multiply by 20.
i.e.
tests <- read.table(header=T, file='clipboard')
tests2 <- (tests - 0) / (15 - 0) #or equally tests / 15
And multiply by 20 to get marks between 0-20:
> tests2 * 20
Score
1 13.333333
2 10.666667
3 2.666667
4 9.333333
5 13.333333
6 13.333333
7 1.333333
8 13.333333
9 18.666667
10 12.000000
11 2.666667
12 8.000000
13 13.333333
14 10.666667
15 13.333333
16 10.666667
17 13.333333
18 13.333333
19 9.333333
20 14.666667
21 13.333333
The results are intuitive and the function is reliable. For example the person who scored 14/15 should get the highest mark (and very close to 20) which is the case here (after the transformation they scored 18.6666).
In Excel, if you want the normalized data to have a min of 0 and a max of 20, then we need to solve:
y = A * x + b
for two points.
Put the max of the raw data in C1:
=MAX(A:A)
Put the min of the raw data in C2:
=MIN(A:A)
Put the desired max in D1 and the desired min in D2. Put the formula for the A-coefficient in C3:
=($D$1-$D$2)/($C$1-$C$2)
and the formula for the B-coefficient in C4:
=$D$1-$C$3*$C$1
Finally put the scaling formula in B1:
=A1*$C$3+$C$4
and copy down.
Naturally, if you want the scaling to be independent of the raw max or min, you would use 15 in C1 and 0 in C2.
You can scale between 0 to 20 with this command in R:
newvalue <- 20/(max(score)-min(score))*(score-min(score))
The math way is fairly straightforward if the floor of both scales is 0:
new_value = new_ceiling * old_value / old_ceiling
The next formula accounts for different floors on each scale:
new_value = new_floor + (new_ceiling - new_floor) * ((old_value - old_floor) / (old_ceiling - old_floor))
which is actually the formula you posted from Wikipedia. ;)
Hope this helps!
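In R, the general formula above translates directly into a small helper function (a sketch; the name rescale_range is just for illustration):
# sketch of the general linear rescaling formula above
rescale_range <- function(x, old_floor, old_ceiling, new_floor, new_ceiling) {
  new_floor + (new_ceiling - new_floor) * (x - old_floor) / (old_ceiling - old_floor)
}
scores <- c(10, 8, 2, 7, 10, 10, 1, 10, 14, 9, 2, 6, 10, 8, 10, 8, 10, 10, 7, 11, 10)
rescale_range(scores, 0, 15, 0, 20)   # maps the 0-15 test scale onto 0-20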
This is very simple. Since both scales are linear, a simple ratio will do the job. In other words, each grade in your set needs to be multiplied by 20/15.
Here's a little R function which can help if you need to repeat the operation, and which gives you some flexibility on what you rescale to. One must also be careful with NA values, because min() and max() do not drop them by default and will then return NA, so I provided an option to handle NA values (dropped by default).
# function rescales data from 0 to 1 and optionally multiplies by new max
rescale <- function(x, new_max = 1, na.rm = TRUE) {
  as.vector(new_max * scale(x,
                            center = min(x, na.rm = na.rm),
                            scale = max(x, na.rm = na.rm) - min(x, na.rm = na.rm)))
}
# old scores
scores <- c(10,8,2,7,10,10,1,10,14,9,2,6,10,8,10,8,10,10,7,11,10)
# new scores
data.frame(old = scores,
new = rescale(scores, new_max = 20))
#> old new
#> 1 10 13.846154
#> 2 8 10.769231
#> 3 2 1.538462
#> 4 7 9.230769
#> 5 10 13.846154
#> 6 10 13.846154
#> 7 1 0.000000
#> 8 10 13.846154
#> 9 14 20.000000
#> 10 9 12.307692
#> 11 2 1.538462
#> 12 6 7.692308
#> 13 10 13.846154
#> 14 8 10.769231
#> 15 10 13.846154
#> 16 8 10.769231
#> 17 10 13.846154
#> 18 10 13.846154
#> 19 7 9.230769
#> 20 11 15.384615
#> 21 10 13.846154
Created on 2022-03-10 by the reprex package (v2.0.1)

How to calculate mean with expending window

I have the data frame below. I wonder how to calculate the mean of column 'value_t' with an expanding window starting from '2014-01-05', e.g. val(1) = mean(1:5), val(2) = mean(1:6), val(3) = mean(1:7). I hope the algorithm is efficient (without a loop).
df<-data.frame(date_t=paste('2014-01-',1:15,sep=""),value_t=1:15)
> df
date_t value_t
1 2014-01-1 1
2 2014-01-2 2
3 2014-01-3 3
4 2014-01-4 4
5 2014-01-5 5
6 2014-01-6 6
7 2014-01-7 7
8 2014-01-8 8
9 2014-01-9 9
10 2014-01-10 10
11 2014-01-11 11
12 2014-01-12 12
13 2014-01-13 13
14 2014-01-14 14
15 2014-01-15 15
How about sapply(5:NROW(df), function(.) mean(df$value_t[1:.]))? It involves kind of a loop (sapply), but it should be reasonably fast.
Have a look at this
df$val <- cumsum(df$value_t) / 1:nrow(df)
df$val[1:4] <- NA
# date_t value_t val
# 2014-01-1 1 NA
# 2014-01-2 2 NA
# 2014-01-3 3 NA
# 2014-01-4 4 NA
# 2014-01-5 5 3.0
# 2014-01-6 6 3.5
# 2014-01-7 7 4.0
# 2014-01-8 8 4.5
# 2014-01-9 9 5.0
# 2014-01-10 10 5.5
# 2014-01-11 11 6.0
# 2014-01-12 12 6.5
# 2014-01-13 13 7.0
# 2014-01-14 14 7.5
# 2014-01-15 15 8.0
If you just want the vector, and you don't want to tamper with df, do
val <- (cumsum(df$value_t) / 1:nrow(df))[-(1:4)]
# 3.0 3.5 4.0 4.5 5.0 5.5 6.0 6.5 7.0 7.5 8.0
The sapply(...) solution is faster than the for(...) loop, but only just (about 2% - well within the margin of error). It turns out that extracting the column from the data frame at every step slows things down considerably. If you grab that column as a vector first, you get a ~25% improvement.
df <- data.frame(value=1:1e4)
f.sapply <- function() sapply(5:nrow(df), function(.) mean(df$value[1:.]))
f.loop <- function() {
  result <- numeric(nrow(df) - 4)
  for (i in 5:nrow(df)) result[i-4] <- mean(df$value[1:i])
  result
}
f.vec <- function() {
  vec <- df$value
  sapply(5:nrow(df), function(.) mean(vec[1:.]))
}
# do they produce identical results?
identical(f.sapply(),f.loop())
# [1] TRUE
identical(f.sapply(),f.vec())
# [1] TRUE
# which is faster?
library(microbenchmark)
microbenchmark(f.sapply(),f.loop(),f.vec())
# Unit: milliseconds
# expr min lq median uq max neval
# f.sapply() 904.2934 929.7361 947.7621 978.8775 1496.455 100
# f.loop() 927.5288 950.3632 963.5926 1012.2407 1347.889 100
# f.vec() 669.5615 697.3639 711.1498 751.2634 1060.056 100
