Group by sum specific column in R - r

# Example data: 26 random items with a quantity and a price.
# NOTE(review): no set.seed() here, so this preamble is not reproducible.
df <- data.frame(items=sample(LETTERS,replace= T),quantity=sample(1:100,26,replace=FALSE),price=sample(100:1000,26,replace=FALSE))
I want to group rows so that the sum of quantity per group is about 500 (ballpark).
Whenever the running total gets close to 500, those rows should go into the same group, like below.
Any help would be appreciated.
Updated
Because the condition needed to change, I reset the threshold to 250.
I summarise to find the max total value for each group, and then:
how could I merge group 6, whose total is < 200, into group 5?
I thought about using ifelse but couldn't get it to work.
set.seed(123)
library(dplyr)  # needed for %>%, group_by(), summarise() used below

# Reproducible sample data: 26 rows of random items, quantities and prices.
df <- data.frame(
  items    = sample(LETTERS, replace = TRUE),
  quantity = sample(1:100, 26, replace = FALSE),
  price    = sample(100:1000, 26, replace = FALSE)
)

# Start a new group each time the running quantity total crosses a multiple
# of 250: cumsum(quantity) %% 250 drops at each crossing, so diff(...) < 0
# flags the first row of every new group.
df$group <- cumsum(c(1, ifelse(diff(cumsum(df$quantity) %% 250) < 0, 1, 0)))

# Running quantity total within each group.
df$total <- ave(df$quantity, df$group, FUN = cumsum)

# Largest running total (i.e. the group sum) per group.
df %>%
  group_by(group) %>%
  summarise(max = max(total, na.rm = TRUE))
# A tibble: 6 × 2
group max
<dbl> <int>
1 1 238
2 2 254
3 3 256
4 4 246
5 5 237
6 6 101
I want get like
> df
items quantity price group total
1 O 36 393 1 36
2 S 78 376 1 114
3 N 81 562 1 195
4 C 43 140 1 238
5 J 76 530 2 76
6 R 15 189 2 91
7 V 32 415 2 123
8 K 7 322 2 130
9 E 9 627 2 139
10 T 41 215 2 180
11 N 74 705 2 254
12 V 23 873 3 23
13 Y 27 846 3 50
14 Z 60 555 3 110
15 E 53 697 3 163
16 S 93 953 3 256
17 Y 86 138 4 86
18 Y 88 258 4 174
19 I 38 851 4 212
20 C 34 308 4 246
21 H 69 473 5 69
22 Z 72 917 5 141
23 G 96 133 5 237
24 J 63 615 5 300
25 I 13 112 5 376
26 S 25 168 5 477
Thank you for all the help, as always.

Base R
# Reproducible example data: 26 items with random quantities and prices.
set.seed(123)
df <- data.frame(
  items = sample(LETTERS, replace = TRUE),
  quantity = sample(1:100, 26, replace = FALSE),
  price = sample(100:1000, 26, replace = FALSE)
)
# The running quantity total, taken modulo 500, drops each time a multiple
# of 500 is crossed; every such drop starts a new group.
df$group <- cumsum(c(1, (diff(cumsum(df$quantity) %% 500) < 0) + 0))
# Cumulative quantity within each group.
df$total <- ave(df$quantity, df$group, FUN = cumsum)
items quantity price group total
1 O 36 393 1 36
2 S 78 376 1 114
3 N 81 562 1 195
4 C 43 140 1 238
5 J 76 530 1 314
6 R 15 189 1 329
7 V 32 415 1 361
8 K 7 322 1 368
9 E 9 627 1 377
10 T 41 215 1 418
11 N 74 705 1 492
12 V 23 873 2 23
13 Y 27 846 2 50
14 Z 60 555 2 110
15 E 53 697 2 163
16 S 93 953 2 256
17 Y 86 138 2 342
18 Y 88 258 2 430
19 I 38 851 2 468
20 C 34 308 2 502
21 H 69 473 3 69
22 Z 72 917 3 141
23 G 96 133 3 237
24 J 63 615 3 300
25 I 13 112 3 313
26 S 25 168 3 338

You could use Reduce(..., accumulate = TRUE) to find where the first cumulative quantity >= 500.
set.seed(123)
df <- data.frame(items=sample(LETTERS,replace= T),quantity=sample(1:100,26,replace=FALSE),price=sample(100:1000,26,replace=FALSE))
library(dplyr)
# Reduce(..., accumulate = TRUE) keeps a running sum of quantity that is
# reset to 0 as soon as it reaches 500; the zeros therefore mark the last
# row of each group. cumsum(... == 0) counts those resets, and
# lag(..., default = 1) shifts the boundary so a new group starts on the
# row *after* each reset.
df %>%
group_by(group = lag(cumsum(Reduce(\(x, y) {
z <- x + y
if(z < 500) z else 0
}, quantity, accumulate = TRUE) == 0) + 1, default = 1)) %>%
# Whole-group quantity total, repeated on every row of the group.
mutate(total = sum(quantity)) %>%
ungroup()
# A tibble: 26 × 5
items quantity price group total
<chr> <int> <int> <dbl> <int>
1 O 36 393 1 515
2 S 78 376 1 515
3 N 81 562 1 515
4 C 43 140 1 515
5 J 76 530 1 515
6 R 15 189 1 515
7 V 32 415 1 515
8 K 7 322 1 515
9 E 9 627 1 515
10 T 41 215 1 515
11 N 74 705 1 515
12 V 23 873 1 515
13 Y 27 846 2 548
14 Z 60 555 2 548
15 E 53 697 2 548
16 S 93 953 2 548
17 Y 86 138 2 548
18 Y 88 258 2 548
19 I 38 851 2 548
20 C 34 308 2 548
21 H 69 473 2 548
22 Z 72 917 3 269
23 G 96 133 3 269
24 J 63 615 3 269
25 I 13 112 3 269
26 S 25 168 3 269

Here is a base R solution. The groups break after the cumulative sum passes a threshold. The output of aggregate shows that all cumulative sums are above thres except for the last one.
# Reproducible sample data: 26 random items with a quantity and a price.
set.seed(2022)
df <- data.frame(items=sample(LETTERS,replace= T),
quantity=sample(1:100,26,replace=FALSE),
price=sample(100:1000,26,replace=FALSE))
# Assign consecutive group ids (starting at 0) so that each group is closed
# on the first element whose running total exceeds `thres`.
#
# x:     numeric/integer vector of quantities
# thres: scalar threshold at which the running total triggers a new group
# Returns an integer vector of group ids, one per element of x.
f <- function(x, thres) {
  grp <- integer(length(x))
  acc <- 0        # running total within the current group
  id <- 0L        # current group id
  for (k in seq_along(x)) {
    acc <- acc + x[k]
    grp[k] <- id
    # The element that pushes the total past the threshold stays in the
    # current group; the *next* element opens a fresh group.
    if (acc > thres) {
      id <- id + 1L
      acc <- 0
    }
  }
  grp
}
# Threshold at which a group is closed.
thres <- 500
# 0-based group id for every row of df.
group <- f(df$quantity, thres)
# Total quantity per group: all groups but the last exceed `thres`.
aggregate(quantity ~ group, df, sum)
#> group quantity
#> 1 0 552
#> 2 1 513
#> 3 2 214
# Running (cumulative) quantity within each group.
ave(df$quantity, group, FUN = cumsum)
#> [1] 70 133 155 224 235 327 347 409 481 484 552 29 95 129 224 263 294 377 433
#> [20] 434 453 513 50 91 182 214
Created on 2022-09-06 by the reprex package (v2.0.1)
Edit
To assign groups and total quantities to the data can be done as follows.
# Attach the group ids and the within-group running totals to the data frame.
df$group <- f(df$quantity, thres)
df$total_quantity <- ave(df$quantity, df$group, FUN = cumsum)
head(df)
#> items quantity price group total_quantity
#> 1 D 70 731 0 70
#> 2 S 63 516 0 133
#> 3 N 22 710 0 155
#> 4 W 69 829 0 224
#> 5 K 11 887 0 235
#> 6 D 92 317 0 327
Created on 2022-09-06 by the reprex package (v2.0.1)
Edit 2
To assign only the total quantity per group use sum instead of cumsum.
# Group-wise grand total (the same value repeated on every row of a group).
df$total_quantity <- ave(df$quantity, df$group, FUN = sum)

Related

Twin primes less than 1000 in r

I have the question: Construct a list of all twin primes less than 1000
So far my code is:
# Test whether a single integer n is prime.
# Fixes an edge case in the original one-liner: for n = 1 the trial-division
# test `all(1 %% 2 != 0)` is TRUE, so isPrime(1) wrongly returned TRUE.
isPrime <- function(n) {
  if (n < 2) return(FALSE)  # 0, 1 and negatives are not prime
  n == 2L || all(n %% 2L:max(2, floor(sqrt(n))) != 0)
}
I'm having trouble constructing the actual list itself — any suggestions?
You could use the sapply command for getting your primes and then with the diff function the pairs
(Thanks Rui for pointing out that sapply is more suited than lapply here!)
# Candidate integers to test.
testThese <- 1:1000
# Keep the candidates for which isPrime() is TRUE.
primes <- testThese[sapply(testThese,isPrime)]
# Positions where consecutive primes differ by 2 (first member of a twin pair).
pairs.temp <- which(diff(primes)==2)
# Indices of both members of every twin pair, in increasing order.
pairs <- sort(c(pairs.temp, pairs.temp+1))
# Two-column matrix: one twin-prime pair per row.
matrix(primes[pairs], ncol=2, byrow=TRUE)
[,1] [,2]
[1,] 3 5
[2,] 5 7
[3,] 11 13
[4,] 17 19
[5,] 29 31
... ... ...
Here is a solution using the Sieve of Eratosthenes:
# Sieve of Eratosthenes over 1..1000: E[i] stays TRUE iff i is prime.
E <- rep(TRUE, 1000)
E[1] <- FALSE  # 1 is not prime
for (i in 2:33) {  # 33 > sqrt(1000), enough to strike out all composites
  if (E[i]) {
    # i survived the sieve so far, hence it is prime: remove its multiples.
    E[seq(2 * i, 1000, by = i)] <- FALSE
  }
}
P <- which(E)              # the primes below 1000
pp <- which(diff(P) == 2)  # index of the first member of each twin pair
cbind(P[pp], P[pp + 1])    # the twin pairs, one per row
If you need a function isPrime() you can do:
# Primality lookup backed by the sieve vector E defined above.
# Vectorised; valid only for indices 1..1000.
isPrime <- function(i) E[i]
isPrime(c(1,2,4,5)) ## Test
Here is how you can construct (not very efficiently though) a list of primes using your function:
# All primes in 1:1000, collected with isPrime().
# Filter() keeps exactly the elements for which isPrime() is TRUE and avoids
# the O(n^2) cost of growing a vector with c() inside a loop.
primes_list <- Filter(isPrime, 1:1000)
You should be able to extend that to sorting out the twin primes.
How about the following?
library(gmp)    # for isprime()
library(dplyr)  # for lead()
# One row per candidate number 1..1000.
df <- expand.grid(x = 1:1000)
# gmp::isprime() returns 2 for numbers proven prime (0 = composite,
# 1 = probably prime; for values this small only 0 and 2 occur).
df$y <- isprime(df$x)
df <- df[df$y == 2,]
# Keep a prime if its gap to the previous prime is 2, or if the next
# prime's gap is 2 — i.e. either member of a twin pair.
df[c(0,diff(df$x)) == 2 | lead(c(0,diff(df$x)) == 2, 1, F),]
x y
3 3 2
5 5 2
7 7 2
11 11 2
13 13 2
17 17 2
19 19 2
29 29 2
31 31 2
41 41 2
43 43 2
59 59 2
61 61 2
71 71 2
73 73 2
101 101 2
103 103 2
107 107 2
109 109 2
137 137 2
139 139 2
149 149 2
151 151 2
179 179 2
181 181 2
191 191 2
193 193 2
197 197 2
199 199 2
227 227 2
229 229 2
239 239 2
241 241 2
269 269 2
271 271 2
281 281 2
283 283 2
311 311 2
313 313 2
347 347 2
349 349 2
419 419 2
421 421 2
431 431 2
433 433 2
461 461 2
463 463 2
521 521 2
523 523 2
569 569 2
571 571 2
599 599 2
601 601 2
617 617 2
619 619 2
641 641 2
643 643 2
659 659 2
661 661 2
809 809 2
811 811 2
821 821 2
823 823 2
827 827 2
829 829 2
857 857 2
859 859 2
881 881 2
883 883 2

Convert non-numeric rows and columns to zero

I have this data from an r package, where X is the dataset with all the data
library(ISLR)
data("Hitters")
# Work on a copy of the Hitters data set.
X=Hitters
head(X)
here is one part of the data:
AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI CWalks League Division PutOuts Assists Errors Salary NewLeague
-Andy Allanson 293 66 1 30 29 14 1 293 66 1 30 29 14 A E 446 33 20 NA A
-Alan Ashby 315 81 7 24 38 39 14 3449 835 69 321 414 375 N W 632 43 10 475.0 N
-Alvin Davis 479 130 18 66 72 76 3 1624 457 63 224 266 263 A W 880 82 14 480.0 A
-Andre Dawson 496 141 20 65 78 37 11 5628 1575 225 828 838 354 N E 200 11 3 500.0 N
-Andres Galarraga 321 87 10 39 42 30 2 396 101 12 48 46 33 N E 805 40 4 91.5 N
-Alfredo Griffin 594 169 4 74 51 35 11 4408 1133 19 501 336 194 A W 282 421 25 750.0 A
I want to set all non-numeric columns (and any non-numeric values) to zero — is there a simple way to do this?
I found an example of how to remove the rows for a single column, but with that approach I would have to repeat it for every column manually.
Is there a function in R that does this for all columns and rows at once?
To remove non-numeric columns, perhaps something like this?
# Keep only the numeric columns (sapply() tests each column of the piped data).
df %>%
select(which(sapply(., is.numeric)))
# AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun
#-Andy Allanson 293 66 1 30 29 14 1 293 66 1
#-Alan Ashby 315 81 7 24 38 39 14 3449 835 69
#-Alvin Davis 479 130 18 66 72 76 3 1624 457 63
#-Andre Dawson 496 141 20 65 78 37 11 5628 1575 225
#-Andres Galarraga 321 87 10 39 42 30 2 396 101 12
#-Alfredo Griffin 594 169 4 74 51 35 11 4408 1133 19
# CRuns CRBI CWalks PutOuts Assists Errors Salary
#-Andy Allanson 30 29 14 446 33 20 NA
#-Alan Ashby 321 414 375 632 43 10 475.0
#-Alvin Davis 224 266 263 880 82 14 480.0
#-Andre Dawson 828 838 354 200 11 3 500.0
#-Andres Galarraga 48 46 33 805 40 4 91.5
#-Alfredo Griffin 501 336 194 282 421 25 750.0
or
# Equivalent: drop the character/factor columns instead of selecting numerics.
df %>%
select(-which(sapply(., function(x) is.character(x) | is.factor(x))))
Or much neater (thanks to #AntoniosK):
# Neater: select_if() keeps the numeric columns directly.
# NOTE(review): select_if() is superseded in current dplyr; select(where(is.numeric)) is the modern form.
df %>% select_if(is.numeric)
Update
To additionally replace NAs with 0, you can do
# Keep the numeric columns, then replace every NA with 0.
df %>% select_if(is.numeric) %>% replace(is.na(.), 0)
# AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun
#-Andy Allanson 293 66 1 30 29 14 1 293 66 1
#-Alan Ashby 315 81 7 24 38 39 14 3449 835 69
#-Alvin Davis 479 130 18 66 72 76 3 1624 457 63
#-Andre Dawson 496 141 20 65 78 37 11 5628 1575 225
#-Andres Galarraga 321 87 10 39 42 30 2 396 101 12
#-Alfredo Griffin 594 169 4 74 51 35 11 4408 1133 19
# CRuns CRBI CWalks PutOuts Assists Errors Salary
#-Andy Allanson 30 29 14 446 33 20 0.0
#-Alan Ashby 321 414 375 632 43 10 475.0
#-Alvin Davis 224 266 263 880 82 14 480.0
#-Andre Dawson 828 838 354 200 11 3 500.0
#-Andres Galarraga 48 46 33 805 40 4 91.5
#-Alfredo Griffin 501 336 194 282 421 25 750.0
library(ISLR)
library(dplyr)
data("Hitters")
d <- head(Hitters)
# Zero out every non-numeric column, then replace remaining NAs with 0.
# mutate_if()/mutate_all() are superseded in current dplyr; across() is the
# modern equivalent with the same behaviour.
d %>%
  mutate(across(where(~ !is.numeric(.x)), ~ 0)) %>%   # non-numeric columns -> 0
  mutate(across(everything(), ~ ifelse(is.na(.x), 0, .x)))  # NA elements -> 0
# AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI CWalks League Division PutOuts Assists Errors Salary NewLeague
# 1 293 66 1 30 29 14 1 293 66 1 30 29 14 0 0 446 33 20 0.0 0
# 2 315 81 7 24 38 39 14 3449 835 69 321 414 375 0 0 632 43 10 475.0 0
# 3 479 130 18 66 72 76 3 1624 457 63 224 266 263 0 0 880 82 14 480.0 0
# 4 496 141 20 65 78 37 11 5628 1575 225 828 838 354 0 0 200 11 3 500.0 0
# 5 321 87 10 39 42 30 2 396 101 12 48 46 33 0 0 805 40 4 91.5 0
# 6 594 169 4 74 51 35 11 4408 1133 19 501 336 194 0 0 282 421 25 750.0 0
If you want to avoid function(x) you can use this
# Same result using Negate() and purrr-style formula lambdas instead of function(x).
d %>%
mutate_if(Negate(is.numeric), ~0) %>%
mutate_all(~ifelse(is.na(.), 0, .))
You can get the numeric columns with sapply/inherits.
X <- Hitters
# Logical vector: TRUE for columns that are integer or numeric.
inx <- sapply(X, inherits, c("integer", "numeric"))
# Logical column subsetting keeps only those columns.
Y <- X[inx]
Then, it wouldn't make much sense to remove the rows with non-numeric entries, they were already removed, but you could do
# Row-wise check; NOTE(review): apply() coerces Y to a matrix first, so each
# row y is a plain numeric vector and inherits() is effectively always TRUE
# here — every row is kept. Shown only for completeness.
inx <- apply(Y, 1, function(y) all(inherits(y, c("integer", "numeric"))))
Y[inx, ]

Applying function to every group in R

> head(m)
X id1 q_following topic_followed topic_answered nfollowers nfollowing
1 1 1 80 80 100 180 180
2 2 1 76 76 95 171 171
3 3 1 72 72 90 162 162
4 4 1 68 68 85 153 153
5 5 1 64 64 80 144 144
6 6 1 60 60 75 135 135
> head(d)
X id1 q_following topic_followed topic_answered nfollowers nfollowing
1 1 1 63 735 665 949 146
2 2 1 89 737 666 587 185
3 3 1 121 742 670 428 264
4 4 1 277 750 706 622 265
5 5 1 339 765 734 108 294
6 6 1 363 767 766 291 427
# Returns the non-NA values of column y of m at rows where column y of d equals x.
# NOTE(review): with y = 3:13 this relies on vectorised multi-column indexing
# and likely does not do what is intended — verify against the real data.
matcher <- function(x,y){ return(na.omit(m[which(d[,y]==x),y])) }
# Sum of matched values across columns 3:13 for a given x.
max_matcher <- function(x) { return(sum(matcher(x,3:13))) }
# NOTE(review): foreach() requires %do%/%dopar% (e.g. foreach(x = 1:1000) %do% {...});
# as written this call does not iterate — the question's code is broken as posted.
result <- foreach(1:1000, function(x) {
if(max(max_matcher(1:1000)) == max_matcher(x)) return(x)
})
I want to compute result across each group, grouped by id1 of dataframe m.
m %>% group_by(id1) %>% summarise(result) #doesn't work
by(m, m[,"id1"], result) #doesn't work
How should I proceed?

dplyr Error: cannot modify grouping variable even when first applying ungroup

I'm getting this error but the fixes in related posts don't seem to apply I'm using ungroup, though it's no longer needed (can I switch the grouping variable in a single dplyr statement? but see Format column within dplyr chain). Also I have no quotes in my group_by call and I'm not applying any functions that act on the grouped-by columns (R dplyr summarize_each --> "Error: cannot modify grouping variable") but I'm still getting this error:
> games2 = baseball %>%
+ ungroup %>%
+ group_by(id, year) %>%
+ summarize(total=g+ab, a = ab+1, id = id)%>%
+ arrange(desc(total)) %>%
+ head(10)
Error: cannot modify grouping variable
This is the baseball set that comes with plyr:
id year stint team lg g ab r h X2b X3b hr rbi sb cs bb so ibb hbp sh sf gidp
4 ansonca01 1871 1 RC1 25 120 29 39 11 3 0 16 6 2 2 1 NA NA NA NA NA
44 forceda01 1871 1 WS3 32 162 45 45 9 4 0 29 8 0 4 0 NA NA NA NA NA
68 mathebo01 1871 1 FW1 19 89 15 24 3 1 0 10 2 1 2 0 NA NA NA NA NA
99 startjo01 1871 1 NY2 33 161 35 58 5 1 1 34 4 2 3 0 NA NA NA NA NA
102 suttoez01 1871 1 CL1 29 128 35 45 3 7 3 23 3 1 1 0 NA NA NA NA NA
106 whitede01 1871 1 CL1 29 146 40 47 6 5 1 21 2 2 4 1 NA NA NA NA NA
I loaded plyr before dplyr. Other bugs to check for? Thanks for any corrections/suggestions.
Not clear what you are doing. I think following is what you are looking for:
# Use mutate() instead of summarize(): per-row columns are added without
# modifying the grouping variables id/year, which avoids the
# "cannot modify grouping variable" error.
games2 = baseball %>%
group_by(id, year) %>%
mutate(total=g+ab, a = ab+1)%>%
arrange(desc(total)) %>%
head(10)
> games2
Source: local data frame [10 x 24]
Groups: id, year
id year stint team lg g ab r h X2b X3b hr rbi sb cs bb so ibb hbp sh sf gidp total a
1 aaronha01 1954 1 ML1 NL 122 468 58 131 27 6 13 69 2 2 28 39 NA 3 6 4 13 590 469
2 aaronha01 1955 1 ML1 NL 153 602 105 189 37 9 27 106 3 1 49 61 5 3 7 4 20 755 603
3 aaronha01 1956 1 ML1 NL 153 609 106 200 34 14 26 92 2 4 37 54 6 2 5 7 21 762 610
4 aaronha01 1957 1 ML1 NL 151 615 118 198 27 6 44 132 1 1 57 58 15 0 0 3 13 766 616
5 aaronha01 1958 1 ML1 NL 153 601 109 196 34 4 30 95 4 1 59 49 16 1 0 3 21 754 602
6 aaronha01 1959 1 ML1 NL 154 629 116 223 46 7 39 123 8 0 51 54 17 4 0 9 19 783 630
7 aaronha01 1960 1 ML1 NL 153 590 102 172 20 11 40 126 16 7 60 63 13 2 0 12 8 743 591
8 aaronha01 1961 1 ML1 NL 155 603 115 197 39 10 34 120 21 9 56 64 20 2 1 9 16 758 604
9 aaronha01 1962 1 ML1 NL 156 592 127 191 28 6 45 128 15 7 66 73 14 3 0 6 14 748 593
10 aaronha01 1963 1 ML1 NL 161 631 121 201 29 4 44 130 31 5 78 94 18 0 0 5 11 792 632
The problem is that you are trying to edit id in the summarize call, but you have grouped on id.
From your example, it looks like you want mutate anyway. You would use summarize if you were looking to apply a function that would return a single value like sum or mean.
# Same idea with explicit dplyr:: namespacing (useful here since the question
# states plyr is also loaded, which can mask dplyr verbs), plus select()
# to keep only the columns of interest.
games2 = baseball %>%
dplyr::group_by(id, year) %>%
dplyr::mutate(
total = g + ab,
a = ab + 1
) %>%
dplyr::select(id, year, total, a) %>%
dplyr::arrange(desc(total)) %>%
head(10)
Source: local data frame [10 x 4]
Groups: id, year
id year total a
1 aaronha01 1954 590 469
2 aaronha01 1955 755 603
3 aaronha01 1956 762 610
4 aaronha01 1957 766 616
5 aaronha01 1958 754 602
6 aaronha01 1959 783 630
7 aaronha01 1960 743 591
8 aaronha01 1961 758 604
9 aaronha01 1962 748 593
10 aaronha01 1963 792 632

Add values to dataframe when condition met in R

I have a dataframe like this:
ID1 ID2 Position Grade Day
234 756 2 87 27
245 486 4 66 26
321 275 1 54 20
768 656 6 51 7
421 181 1 90 14
237 952 8 68 23
237 553 4 32 30
And I have another dataframe like this:
ID1 ID2 Day Count
234 756 2 3
245 486 2 1
209 706 2 1
124 554 2 2
237 553 2 4
I need to add the Counts to the first dataframe where the ID1, ID2 and Day are matched. However, I also need to have it so that if there is no match (no Counts in the second dataframe for the set of ID1, ID2 and Day in the first dataframe) then a zero is put in that place. So the final dataframe would be something like:
ID1 ID2 Position Grade Day Count
234 756 2 87 27 3
245 486 4 66 26 1
321 275 1 54 20 0
768 656 6 51 7 0
421 181 1 90 14 0
237 952 8 68 23 0
237 553 4 32 30 4
This can be useful
> # First, merge df1 and df2
> df3 <- merge(df1, df2, by=c("ID1", "ID2"), all.x=TRUE)
> # Replace NA with 0's
> transform(df3[, -6], Count=ifelse(is.na(Count), 0, Count))
ID1 ID2 Position Grade Day.x Count
1 234 756 2 87 27 3
2 237 553 4 32 30 4
3 237 952 8 68 23 0
4 245 486 4 66 26 1
5 321 275 1 54 20 0
6 421 181 1 90 14 0
7 768 656 6 51 7 0

Resources