Applying function to every group in R

> head(m)
X id1 q_following topic_followed topic_answered nfollowers nfollowing
1 1 1 80 80 100 180 180
2 2 1 76 76 95 171 171
3 3 1 72 72 90 162 162
4 4 1 68 68 85 153 153
5 5 1 64 64 80 144 144
6 6 1 60 60 75 135 135
> head(d)
X id1 q_following topic_followed topic_answered nfollowers nfollowing
1 1 1 63 735 665 949 146
2 2 1 89 737 666 587 185
3 3 1 121 742 670 428 264
4 4 1 277 750 706 622 265
5 5 1 339 765 734 108 294
6 6 1 363 767 766 291 427
library(foreach)
matcher <- function(x, y) na.omit(m[which(d[, y] == x), y])
max_matcher <- function(x) sum(matcher(x, 3:13))
result <- foreach(x = 1:1000) %do% {
  if (max(max_matcher(1:1000)) == max_matcher(x)) x
}
I want to compute result within each group of dataframe m, grouped by id1.
m %>% group_by(id1) %>% summarise(result) #doesn't work
by(m, m[,"id1"], result) #doesn't work
How should I proceed?
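A hedged sketch of one way to proceed, using base R's split() to run a computation on each id1 group of m; group_result here is a hypothetical placeholder for whatever per-group computation you settle on:
group_result <- function(grp) {
  nrow(grp)  # placeholder: swap in the real per-group computation
}
results_by_group <- lapply(split(m, m$id1), group_result)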


Group by sum specific column in R

df <- data.frame(items = sample(LETTERS, replace = TRUE),
                 quantity = sample(1:100, 26, replace = FALSE),
                 price = sample(100:1000, 26, replace = FALSE))
I want to group rows by a running sum of quantity of roughly 500 (ballpark): when the cumulative quantity gets close to 500, close the group off and start a new one, like below.
Any help would be appreciated.
Updated
Because the condition needed to change, I reset the threshold to 250.
I summarise to find the max total value for each group, and then:
how could I merge group 6, whose total is < 200, into group 5?
I thought about using ifelse but couldn't make it work.
set.seed(123)
df <- data.frame(items = sample(LETTERS, replace = TRUE),
                 quantity = sample(1:100, 26, replace = FALSE),
                 price = sample(100:1000, 26, replace = FALSE))
df$group <- cumsum(c(1, ifelse(diff(cumsum(df$quantity) %% 250) < 0, 1, 0)))
df$total <- ave(df$quantity, df$group, FUN = cumsum)
df %>% group_by(group) %>% summarise(max = max(total, na.rm = TRUE))
# A tibble: 6 × 2
group max
<dbl> <int>
1 1 238
2 2 254
3 3 256
4 4 246
5 5 237
6 6 101
I want to get:
> df
items quantity price group total
1 O 36 393 1 36
2 S 78 376 1 114
3 N 81 562 1 195
4 C 43 140 1 238
5 J 76 530 2 76
6 R 15 189 2 91
7 V 32 415 2 123
8 K 7 322 2 130
9 E 9 627 2 139
10 T 41 215 2 180
11 N 74 705 2 254
12 V 23 873 3 23
13 Y 27 846 3 50
14 Z 60 555 3 110
15 E 53 697 3 163
16 S 93 953 3 256
17 Y 86 138 4 86
18 Y 88 258 4 174
19 I 38 851 4 212
20 C 34 308 4 246
21 H 69 473 5 69
22 Z 72 917 5 141
23 G 96 133 5 237
24 J 63 615 5 300
25 I 13 112 5 376
26 S 25 168 5 477
Thank you for your help, as always.
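A hedged sketch of one way to handle the update (fold the last group into the previous one when its quantity total is under 200), assuming the group and total columns computed above:
last_grp <- max(df$group)
if (sum(df$quantity[df$group == last_grp]) < 200) {
  df$group[df$group == last_grp] <- last_grp - 1         # merge group 6 into group 5
  df$total <- ave(df$quantity, df$group, FUN = cumsum)   # recompute running totals
}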
Base R
set.seed(123)
df <- data.frame(items = sample(LETTERS, replace = TRUE),
                 quantity = sample(1:100, 26, replace = FALSE),
                 price = sample(100:1000, 26, replace = FALSE))
df$group <- cumsum(c(1, ifelse(diff(cumsum(df$quantity) %% 500) < 0, 1, 0)))
df$total <- ave(df$quantity, df$group, FUN = cumsum)
items quantity price group total
1 O 36 393 1 36
2 S 78 376 1 114
3 N 81 562 1 195
4 C 43 140 1 238
5 J 76 530 1 314
6 R 15 189 1 329
7 V 32 415 1 361
8 K 7 322 1 368
9 E 9 627 1 377
10 T 41 215 1 418
11 N 74 705 1 492
12 V 23 873 2 23
13 Y 27 846 2 50
14 Z 60 555 2 110
15 E 53 697 2 163
16 S 93 953 2 256
17 Y 86 138 2 342
18 Y 88 258 2 430
19 I 38 851 2 468
20 C 34 308 2 502
21 H 69 473 3 69
22 Z 72 917 3 141
23 G 96 133 3 237
24 J 63 615 3 300
25 I 13 112 3 313
26 S 25 168 3 338
You could use Reduce(..., accumulate = TRUE) to find where the first cumulative quantity >= 500.
set.seed(123)
df <- data.frame(items = sample(LETTERS, replace = TRUE),
                 quantity = sample(1:100, 26, replace = FALSE),
                 price = sample(100:1000, 26, replace = FALSE))
library(dplyr)
df %>%
  group_by(group = lag(cumsum(Reduce(\(x, y) {
    z <- x + y
    if (z < 500) z else 0
  }, quantity, accumulate = TRUE) == 0) + 1, default = 1)) %>%
  mutate(total = sum(quantity)) %>%
  ungroup()
# A tibble: 26 × 5
items quantity price group total
<chr> <int> <int> <dbl> <int>
1 O 36 393 1 515
2 S 78 376 1 515
3 N 81 562 1 515
4 C 43 140 1 515
5 J 76 530 1 515
6 R 15 189 1 515
7 V 32 415 1 515
8 K 7 322 1 515
9 E 9 627 1 515
10 T 41 215 1 515
11 N 74 705 1 515
12 V 23 873 1 515
13 Y 27 846 2 548
14 Z 60 555 2 548
15 E 53 697 2 548
16 S 93 953 2 548
17 Y 86 138 2 548
18 Y 88 258 2 548
19 I 38 851 2 548
20 C 34 308 2 548
21 H 69 473 2 548
22 Z 72 917 3 269
23 G 96 133 3 269
24 J 63 615 3 269
25 I 13 112 3 269
26 S 25 168 3 269
Here is a base R solution. The groups break after the cumulative sum passes a threshold. The output of aggregate shows that all cumulative sums are above thres except for the last one.
set.seed(2022)
df <- data.frame(items=sample(LETTERS,replace= T),
quantity=sample(1:100,26,replace=FALSE),
price=sample(100:1000,26,replace=FALSE))
f <- function(x, thres) {
  grp <- integer(length(x))
  run <- 0              # running sum within the current group
  current_grp <- 0L
  for (i in seq_along(x)) {
    run <- run + x[i]
    grp[i] <- current_grp
    if (run > thres) {  # close the group once the threshold is passed
      current_grp <- current_grp + 1L
      run <- 0
    }
  }
  grp
}
thres <- 500
group <- f(df$quantity, thres)
aggregate(quantity ~ group, df, sum)
#> group quantity
#> 1 0 552
#> 2 1 513
#> 3 2 214
ave(df$quantity, group, FUN = cumsum)
#> [1] 70 133 155 224 235 327 347 409 481 484 552 29 95 129 224 263 294 377 433
#> [20] 434 453 513 50 91 182 214
Created on 2022-09-06 by the reprex package (v2.0.1)
Edit
Groups and total quantities can be assigned to the data as follows.
df$group <- f(df$quantity, thres)
df$total_quantity <- ave(df$quantity, df$group, FUN = cumsum)
head(df)
#> items quantity price group total_quantity
#> 1 D 70 731 0 70
#> 2 S 63 516 0 133
#> 3 N 22 710 0 155
#> 4 W 69 829 0 224
#> 5 K 11 887 0 235
#> 6 D 92 317 0 327
Created on 2022-09-06 by the reprex package (v2.0.1)
Edit 2
To assign only the total quantity per group, use sum instead of cumsum.
df$total_quantity <- ave(df$quantity, df$group, FUN = sum)

Reindexing a column in R

I'm dealing with the following dataset
animal protein herd sire dam
6 416 189.29 2 15 236
7 417 183.27 2 6 295
9 419 193.24 3 11 268
10 420 198.84 2 12 295
11 421 205.25 3 3 251
12 422 204.15 2 2 281
13 423 200.20 2 3 248
14 424 197.22 2 11 222
15 425 201.14 1 10 262
17 427 196.20 1 11 290
18 428 208.13 3 9 294
19 429 213.01 3 14 254
21 431 203.38 2 4 273
22 432 190.56 2 8 248
25 435 196.59 3 9 226
26 436 193.31 3 10 249
27 437 207.89 3 7 272
29 439 202.98 2 10 260
30 440 177.28 2 4 291
31 441 182.04 1 6 282
32 442 217.50 2 3 265
33 443 190.43 2 11 248
35 445 197.24 2 4 256
37 447 197.16 3 5 240
42 452 183.07 3 5 293
43 453 197.99 2 6 293
44 454 208.27 2 6 254
45 455 187.61 3 12 271
46 456 173.18 2 6 280
47 457 187.89 2 6 235
48 458 191.96 1 7 286
49 459 196.39 1 4 275
50 460 178.51 2 13 262
52 462 204.17 1 6 253
53 463 203.77 2 11 273
54 464 206.25 1 13 249
55 465 211.63 2 13 222
56 466 211.34 1 6 228
57 467 194.34 2 1 217
58 468 201.53 2 12 247
59 469 198.01 2 3 251
60 470 188.94 2 7 290
61 471 190.49 3 2 220
62 472 197.34 2 3 224
63 473 194.04 1 15 229
64 474 202.74 2 1 287
67 477 189.98 1 6 300
69 479 206.37 3 2 293
70 480 183.81 2 10 274
72 482 190.70 2 12 265
74 484 194.25 3 2 262
75 485 191.15 3 10 297
76 486 193.23 3 15 255
77 487 193.29 2 4 266
78 488 182.20 1 15 260
81 491 195.89 2 12 294
82 492 200.77 1 8 278
83 493 179.12 2 7 281
85 495 172.14 3 13 252
86 496 183.82 1 4 264
88 498 195.32 1 6 249
89 499 197.19 1 13 274
90 500 178.07 1 8 293
92 502 209.65 2 7 241
95 505 199.66 3 5 220
96 506 190.96 2 11 259
98 508 206.58 3 3 230
100 510 196.60 2 5 231
103 513 193.25 2 15 280
104 514 181.34 2 3 227
I'm interested in the animal indices and the dam indices corresponding to them. Using the table function, I was able to check that some dams are matched to several different animals. In fact, I got the following output:
217 220 222 224 226 227 228 229 230 231 235 236 240 241 247 248 249 251 252 253 254 255 256 259 260 262
1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 3 3 2 1 1 2 1 1 1 2 3
264 265 266 268 271 272 273 274 275 278 280 281 282 286 287 290 291 293 294 295 297 300
1 2 1 1 1 1 2 2 1 1 2 2 1 1 1 2 1 4 2 2 1 1
Using the length function, I checked that there are only 48 distinct dams in this dataset.
I would like to 'reindex' them with the integers 1, ..., 48 instead of the values given in my data set. Is there a method for doing this?
You can use match and unique.
df$index <- match(df$dam, unique(df$dam))
Or convert to factor and then integer
df$index <- as.integer(factor(df$dam))
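Note that the factor route numbers groups by sorted dam value, while match(..., unique(...)) numbers them by first appearance, so the two indexings can differ. A quick check on a hypothetical toy vector:
dam <- c(236, 295, 268, 295, 251)
match(dam, unique(dam))   # 1 2 3 2 4 (first-appearance order)
as.integer(factor(dam))   # 1 4 3 4 2 (sorted-value order)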
Another option is group_indices from dplyr.
df$index <- dplyr::group_indices(df, dam)
We can use .GRP in data.table
library(data.table)
setDT(df)[, index := .GRP, dam]

In R how to fill in numbers that do not appear in the sequence?

I have a data set that list the percentiles for a set of scores like this:
> percentiles
Score Percentile
1 231 0
2 385 1
3 403 2
4 413 3
5 418 4
6 424 5
7 429 6
8 434 7
9 437 8
10 441 9
11 443 10
I would like the "Score" column to run from 100 to 500. That is, I would like Scores 100 to 231 to be associated with a Percentile of 0, Scores 232 to 385 to be associated with a Percentile of 1, etc. Is there a simple way to fill in the values that do not appear in the sequence of "Score" values so it looks like the below data set?
> percentiles
Score Percentile
1 100 0
2 101 0
3 102 0
4 103 0
5 104 0
6 105 0
7 106 0
8 107 0
9 108 0
10 109 0
--------------------
130 229 0
131 230 0
132 231 0
133 232 1
134 233 1
135 234 1
136 235 1
137 236 1
138 237 1
139 238 1
140 239 1
If you convert percentiles to a data.table, you can do a rolling join against a new table of all scores 100:500. The rolling join with roll = -Inf gives fill-backward behavior by itself, but the 444:500 values are still NA, so a forward nafill is added at the end.
library(data.table)
setDT(percentiles)
percentiles[data.table(Score = 100:500), on = .(Score), roll = -Inf
][, Percentile := nafill(Percentile, 'locf')]
# Score Percentile
# 1: 100 0
# 2: 101 0
# 3: 102 0
# 4: 103 0
# 5: 104 0
# ---
# 397: 496 10
# 398: 497 10
# 399: 498 10
# 400: 499 10
# 401: 500 10
You might think about this differently: instead of a data frame to fill, treat the scores as a set of breaks for binning. Use the scores as the breaks, with -Inf tacked on as the lower bound. If you need something different to happen for the scores above the highest break, add Inf to the end of the breaks, but you'll need to come up with an additional label.
library(dplyr)
dat <- data.frame(Score = 100:500) %>%
  mutate(Percentile = cut(Score, breaks = c(-Inf, percentiles$Score),
                          labels = percentiles$Percentile,
                          right = TRUE, include.lowest = FALSE))
Taking a look at a few of the breaking points:
slice(dat, c(129:135, 342:346))
#> Score Percentile
#> 1 228 0
#> 2 229 0
#> 3 230 0
#> 4 231 0
#> 5 232 1
#> 6 233 1
#> 7 234 1
#> 8 441 9
#> 9 442 10
#> 10 443 10
#> 11 444 <NA>
#> 12 445 <NA>
We could use complete
library(dplyr)
library(tidyr)
out <- complete(percentiles, Score = 100:500) %>%
fill(Percentile, .direction = "updown")
out %>%
slice(c(1:10, 130:140)) %>%
as.data.frame
# Score Percentile
#1 100 0
#2 101 0
#3 102 0
#4 103 0
#5 104 0
#6 105 0
#7 106 0
#8 107 0
#9 108 0
#10 109 0
#11 229 0
#12 230 0
#13 231 0
#14 232 1
#15 233 1
#16 234 1
#17 235 1
#18 236 1
#19 237 1
#20 238 1
#21 239 1
data
percentiles <- structure(list(Score = c(231L, 385L, 403L, 413L, 418L, 424L,
429L, 434L, 437L, 441L, 443L), Percentile = 0:10), class = "data.frame",
row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11"))
In base R you could use the findInterval function to break up your sequence 100:500 into buckets determined by the Score, then index into the Percentile column:
x <- 100:500
ind <- findInterval(x, percentiles$Score, left.open = TRUE)
output <- data.frame(Score = x, Percentile = percentiles$Percentile[ind + 1])
Values of x above 443 will receive a percentile of NA.
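If you would rather carry the top percentile forward than get NA above 443, one hedged tweak is to cap the lookup index with pmin:
output <- data.frame(Score = x,
                     Percentile = percentiles$Percentile[pmin(ind + 1, nrow(percentiles))])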
Here is a base R solution where cut() and match() do the work:
df <- data.frame(Score = (x <- 100:500),
                 percentile = percentiles$Percentile[match(s <- cut(x, c(0, percentiles$Score)), levels(s))])
such that
> df
    Score percentile
1     100          0
2     101          0
3     102          0
...
131   230          0
132   231          0
133   232          1
134   233          1
...
285   384          1
286   385          1
287   386          2
288   387          2
...
342   441          9
343   442         10
344   443         10
345   444         NA
...
401   500         NA
A bit hacky Base R:
# Create a dataframe with all score values in the range:
score_range <- merge(data.frame(Score = c(100:500)), percentiles, by = "Score", all.x = TRUE)
# Reverse the order of the dataframe:
score_range <- score_range[rev(order(score_range$Score)),]
# Change the first NA to the maximum score:
score_range$Percentile[which(is.na(score_range$Percentile))][1] <- max(score_range$Percentile, na.rm = TRUE)
# Replace all NAs with the value before them:
score_range$Percentile <- na.omit(score_range$Percentile)[cumsum(!is.na(score_range$Percentile))]
Data:
percentiles <- structure(list(Score = c(231L, 385L, 403L, 413L, 418L, 424L,
429L, 434L, 437L, 441L, 443L),
Percentile = 0:10), class = "data.frame",
row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11"))

Twin primes less than 1000 in r

I have the question: Construct a list of all twin primes less than 1000
So far my code is:
isPrime <- function(n) n == 2L || all(n %% 2L:max(2, floor(sqrt(n))) != 0)
I'm having trouble constructing the actual list itself; any suggestions?
You could use sapply to get your primes and then find the pairs with diff.
(Thanks Rui for pointing out that sapply is better suited than lapply here!)
testThese <- 1:1000
primes <- testThese[sapply(testThese, isPrime)]
pairs.temp <- which(diff(primes) == 2)
pairs <- sort(c(pairs.temp, pairs.temp + 1))
matrix(primes[pairs], ncol = 2, byrow = TRUE)
[,1] [,2]
[1,] 3 5
[2,] 5 7
[3,] 11 13
[4,] 17 19
[5,] 29 31
... ... ...
Here is a solution using the Sieve of Eratosthenes:
E <- rep(TRUE, 1000)        # TRUE = "still possibly prime"
E[1] <- FALSE
for (i in 2:33) {           # 33 >= sqrt(1000), enough to mark every composite
  if (!E[i]) next
  E[seq(i + i, 1000, i)] <- FALSE
}
P <- which(E)               ## primes
pp <- which(diff(P) == 2)   ## index of the first twin
cbind(P[pp], P[pp + 1])     ## the twins
If you need a function isPrime() you can do:
isPrime <- function(i) E[i]
isPrime(c(1,2,4,5)) ## Test
Here is how you can construct (not very efficiently though) a list of primes using your function:
primes_list <- vector(length = 0, mode = "integer")
for (i in 2:1000) {  # start at 2: the isPrime() in the question returns TRUE for 1
  if (isPrime(i)) primes_list <- c(primes_list, i)
}
You should be able to extend that to sorting out the twin primes.
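For instance, a minimal sketch of that extension, pairing each prime in primes_list with the next one when they differ by exactly 2:
idx <- which(diff(primes_list) == 2)   # positions where the next prime is 2 away
twins <- cbind(primes_list[idx], primes_list[idx + 1])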
How about the following?
library(gmp)
library(dplyr)
df <- expand.grid(x = 1:1000)
df$y <- isprime(df$x)
df <- df[df$y == 2,]
df[c(0,diff(df$x)) == 2 | lead(c(0,diff(df$x)) == 2, 1, F),]
x y
3 3 2
5 5 2
7 7 2
11 11 2
13 13 2
17 17 2
19 19 2
29 29 2
31 31 2
41 41 2
43 43 2
59 59 2
61 61 2
71 71 2
73 73 2
101 101 2
103 103 2
107 107 2
109 109 2
137 137 2
139 139 2
149 149 2
151 151 2
179 179 2
181 181 2
191 191 2
193 193 2
197 197 2
199 199 2
227 227 2
229 229 2
239 239 2
241 241 2
269 269 2
271 271 2
281 281 2
283 283 2
311 311 2
313 313 2
347 347 2
349 349 2
419 419 2
421 421 2
431 431 2
433 433 2
461 461 2
463 463 2
521 521 2
523 523 2
569 569 2
571 571 2
599 599 2
601 601 2
617 617 2
619 619 2
641 641 2
643 643 2
659 659 2
661 661 2
809 809 2
811 811 2
821 821 2
823 823 2
827 827 2
829 829 2
857 857 2
859 859 2
881 881 2
883 883 2

Function or other basic script that compares values on two variables in a dataframe using an id variable located in both

Let's say you have two data frames, both of which contain some, but not all, of the same records. Where a record appears in both data frames, its id variable matches. A particular variable in each data frame needs to be checked for consistency across the two, and any discrepancies need to be printed:
d1 <- ## first dataframe
d2 <- ## second dataframe
colnames(d1) #column headings for dataframe 1
[1] "id" "variable1" "variable2" "variable3"
colnames(d2) #column headings for dataframe 2 are identical
[1] "id" "variable1" "variable2" "variable3"
length(d1$id) #there are 200 records in dataframe 1
[1] 200
length(d2$id) #there are not the same number in dataframe 2
[1] 150
##Some function that takes d1$id, matches with d2$id, then compares the values of the matched, returning any discrepancies
I constructed an elaborate loop for this, but feel as though this is not the right way of going about it. Surely there is some better way than this for-if-for-if-if statement.
for (i in seq(d1$id)) { ## sets up counter for loop
  if (d1$id[i] %in% d2$id) { ## searches for, compares and saves a common id and variable
    index <- d1$id[i]
    variable_d1 <- d1$variable1[i]
    for (p in seq(d2$id)) {
      if (d2$id[p] == index) { ## saves the corresponding value in the second dataframe
        variable_d2 <- d2$variable1[p]
        if (variable_d2 != variable_d1) { ## prints if they are not equal
          print(index)
        }
      }
    }
  }
}
Here's a solution, using random input data with a 50% chance that a given cell will be discrepant between d1 and d2:
set.seed(1);
d1 <- data.frame(id=sample(300,200),variable1=sample(2,200,replace=T),variable2=sample(2,200,replace=T),variable3=sample(2,200,replace=T));
d2 <- data.frame(id=sample(300,150),variable1=sample(2,150,replace=T),variable2=sample(2,150,replace=T),variable3=sample(2,150,replace=T));
head(d1);
## id variable1 variable2 variable3
## 1 80 1 2 2
## 2 112 1 1 2
## 3 171 2 2 1
## 4 270 1 2 2
## 5 60 1 2 2
## 6 266 2 2 2
head(d2);
## id variable1 variable2 variable3
## 1 258 1 2 1
## 2 11 1 1 1
## 3 290 2 1 2
## 4 222 2 1 2
## 5 81 2 1 1
## 6 200 1 2 1
com <- intersect(d1$id,d2$id); ## derive common id values
d1com <- match(com,d1$id); ## find indexes of d1 that correspond to common id values, in order of com
d2com <- match(com,d2$id); ## find indexes of d2 that correspond to common id values, in order of com
v1diff <- com[d1$variable1[d1com]!=d2$variable1[d2com]]; ## get ids of variable1 discrepancies
v1diff;
## [1] 60 278 18 219 290 35 107 4 237 131 50 210 29 168 6 174 61 127 99 220 247 244 157 51 84 122 196 125 265 115 186 139 3 132 223 211 268 102 155 207 238 41 199 200 231 236 172 275 250 176 248 255 222 59 100 33 124
v2diff <- com[d1$variable2[d1com]!=d2$variable2[d2com]]; ## get ids of variable2 discrepancies
v2diff;
## [1] 112 60 18 198 219 290 131 50 210 29 168 258 215 291 127 161 99 220 110 293 87 164 84 122 196 125 186 139 81 132 82 89 223 268 98 14 155 241 207 231 172 62 275 176 248 255 59 298 100 12 156
v3diff <- com[d1$variable3[d1com]!=d2$variable3[d2com]]; ## get ids of variable3 discrepancies
v3diff;
## [1] 278 219 290 35 4 237 131 168 202 174 215 220 247 244 261 293 164 13 294 84 196 125 265 115 186 81 3 89 223 211 268 98 14 155 241 207 38 191 200 276 250 45 269 255 298 100 12 156 124
Here's a proof that all variable1 values for ids in v1diff are really discrepant between d1 and d2:
d1$variable1[match(v1diff,d1$id)]; d2$variable1[match(v1diff,d2$id)];
## [1] 1 2 2 1 1 2 2 1 1 1 2 2 2 2 1 2 2 1 2 2 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 2 2 2 1 2 2 1 1 2 1 1 2 1 2 1 2 2 1 2 2 1 1
## [1] 2 1 1 2 2 1 1 2 2 2 1 1 1 1 2 1 1 2 1 1 2 2 1 2 2 1 2 2 2 2 2 2 2 2 2 1 1 1 2 1 1 2 2 1 2 2 1 2 1 2 1 1 2 1 1 2 2
Here's a proof that all variable1 values for ids not in v1diff are not discrepant between d1 and d2:
with(subset(d1,id%in%com&!id%in%v1diff),variable1[order(id)]); with(subset(d2,id%in%com&!id%in%v1diff),variable1[order(id)]);
## [1] 1 1 2 1 1 1 2 2 1 2 2 1 2 2 1 1 2 1 2 1 2 1 1 1 1 1 1 2 2 2 2 1 1 1 2 2 2 1 1 1 1
## [1] 1 1 2 1 1 1 2 2 1 2 2 1 2 2 1 1 2 1 2 1 2 1 1 1 1 1 1 2 2 2 2 1 1 1 2 2 2 1 1 1 1
Here, I wrapped this solution in a function which returns the vectors of discrepant id values in a list, with each component named for the variable it represents:
compare <- function(d1,d2,cols=setdiff(intersect(colnames(d1),colnames(d2)),'id')) {
com <- intersect(d1$id,d2$id);
d1com <- match(com,d1$id);
d2com <- match(com,d2$id);
setNames(lapply(cols,function(col) com[d1[[col]][d1com]!=d2[[col]][d2com]]),cols);
};
compare(d1,d2);
## $variable1
## [1] 60 278 18 219 290 35 107 4 237 131 50 210 29 168 6 174 61 127 99 220 247 244 157 51 84 122 196 125 265 115 186 139 3 132 223 211 268 102 155 207 238 41 199 200 231 236 172 275 250 176 248 255 222 59 100 33 124
##
## $variable2
## [1] 112 60 18 198 219 290 131 50 210 29 168 258 215 291 127 161 99 220 110 293 87 164 84 122 196 125 186 139 81 132 82 89 223 268 98 14 155 241 207 231 172 62 275 176 248 255 59 298 100 12 156
##
## $variable3
## [1] 278 219 290 35 4 237 131 168 202 174 215 220 247 244 261 293 164 13 294 84 196 125 265 115 186 81 3 89 223 211 268 98 14 155 241 207 38 191 200 276 250 45 269 255 298 100 12 156 124
Here is an approach using merge.
First, merge the dataframes, keeping all columns.
x <- merge(d1, d2, by="id")
Then, find all rows which do not match:
x[x$variable1.x != x$variable1.y | x$variable2.x != x$variable2.y |
x$variable3.x != x$variable3.y, ]
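If only the discrepant ids are needed, a small hedged follow-up sketch on the merged frame:
mismatch <- x$variable1.x != x$variable1.y | x$variable2.x != x$variable2.y |
  x$variable3.x != x$variable3.y
x$id[mismatch]   # ids whose values disagree between d1 and d2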
