R programming - data frame manoeuvre

Suppose I have the following dataframe:
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 50 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
5: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
6: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 0 48
4: 2 3 TRUE 1 2010 0 50
5: 2 3 TRUE 1 2010 0 52
6: 3 3 FALSE 1 2010 0 57
I'd like to turn it into a new dataframe like the following:
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 0 48   (nF here is the sum of nF for dc 48 and 50)
4: 2 3 TRUE 1 2010 0 52
5: 3 3 FALSE 1 2010 0 57
How can I do it? (Of course, the actual data frame, abc, is much larger, but I want to sum the nF values for categories 48 and 50 and group them into a single new category, say '48'.)
Many thanks!
> dput(head(abc1))
structure(list(dc = c(24L, 41L, 48L, 50L, 52L, 57L), tmin = c(-1L,
-3L, 0L, 0L, 3L, -2L), tmax = c(4L, 5L, 5L, 5L, 5L, 5L), cint = c(5L,
8L, 5L, 5L, 2L, 7L), wcmin = c(-5L, -8L, -4L, -4L, -3L, -6L),
wcmax = c(-2L, -3L, 0L, 0L, 1L, -1L), wsmin = c(20L, 15L,
30L, 30L, 20L, 25L), wsmax = c(25L, 20L, 35L, 35L, 25L, 30L
), gsmin = c(35L, 35L, 45L, 45L, 35L, 35L), gsmax = c(40L,
40L, 50L, 50L, 40L, 40L), wd = c(90L, 90L, 45L, 45L, 45L,
315L), rmin = c(11.8, 10, 7.3, 7.3, 6.7, 4.4), rmax = c(26.6,
23.5, 19, 19, 17.4, 13.8), cir = c(14.8, 13.5, 11.7, 11.7,
10.7, 9.4), lr = c(3L, 3L, 6L, 6L, 6L, 7L), lc = c(1L, 1L,
2L, 2L, 2L, 3L), wc = c(3L, 3L, 3L, 3L, 3L, 3L), li = c(TRUE,
TRUE, TRUE, TRUE, TRUE, FALSE), yd = c(1L, 1L, 1L, 1L, 1L,
1L), yr = c(2010L, 2010L, 2010L, 2010L, 2010L, 2010L), nF = c(2L,
8L, 0L, 0L, 0L, 0L), factdcx = structure(1:6, .Label = c("24",
"41", "48", "50", "52", "57", "70"), class = "factor")), .Names = c("dc",
"tmin", "tmax", "cint", "wcmin", "wcmax", "wsmin", "wsmax", "gsmin",
"gsmax", "wd", "rmin", "rmax", "cir", "lr", "lc", "wc", "li",
"yd", "yr", "nF", "factdcx"), class = c("data.table", "data.frame"
), row.names = c(NA, -6L), .internal.selfref = <pointer: 0x054b24a0>)
Still got a problem:
> head(abc1)   # abc1 after applying the update above
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
6: 70 -2 3 5 -4 -1 20 25 30 35 360 3.6 10.2 6.6 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 57 48
4: 2 3 TRUE 1 2010 0 52
5: 3 3 FALSE 1 2010 0 57
6: 3 2 TRUE 1 2010 1 70
The resulting sum of nF (57 in row 3) is incorrect; it should be zero.

Try
library(data.table)
unique(setDT(df1)[, factdcx := as.character(factdcx)][factdcx %chin% c('48', '50'),
       c('dc', 'factdcx', 'nF') := list('48', '48', sum(nF))])
# dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
#1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
#2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
#3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
#4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
#5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
# lc wc li yd yr nF factdcx
#1: 1 3 TRUE 1 2010 2 24
#2: 1 3 TRUE 1 2010 8 41
#3: 2 3 TRUE 1 2010 0 48
#4: 2 3 TRUE 1 2010 0 52
#5: 3 3 FALSE 1 2010 0 57
For abc1,
res1 <- unique(setDT(abc1)[, factdcx := as.character(factdcx)][factdcx %chin% c('48', '50'),
        c('dc', 'factdcx', 'nF') := list(48, '48', sum(nF))])
res1
# dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
#1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
#2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
#3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
#4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
#5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
# lc wc li yd yr nF factdcx
#1: 1 3 TRUE 1 2010 2 24
#2: 1 3 TRUE 1 2010 8 41
#3: 2 3 TRUE 1 2010 0 48
#4: 2 3 TRUE 1 2010 0 52
#5: 3 3 FALSE 1 2010 0 57
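If several categories ever need to be collapsed at once, a slightly more general sketch (my own variation, starting again from the original abc1 and reusing its column layout) is to recode the category first, total nF within each recoded group, and then keep one row per group:
library(data.table)
dt <- as.data.table(abc1)                                      # work on a copy of the original abc1
dt[, factdcx := as.character(factdcx)]
dt[factdcx == '50', c('factdcx', 'dc') := list('48', 48L)]     # fold category 50 into 48
dt[, nF := sum(nF), by = factdcx]                              # total nF within each (recoded) category
res2 <- unique(dt, by = 'factdcx')                             # keep one row per category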
data
df1 <- structure(list(dc = structure(1:6, .Label = c("24", "41",
"48",
"50", "52", "57"), class = "factor"), tmin = c(-1L, -3L, 0L,
0L, 3L, -2L), tmax = c(4L, 5L, 5L, 5L, 5L, 5L), cint = c(5L,
8L, 5L, 5L, 2L, 7L), wcmin = c(-5L, -8L, -4L, -4L, -3L, -6L),
wcmax = c(-2L, -3L, 0L, 0L, 1L, -1L), wsmin = c(20L, 15L,
30L, 30L, 20L, 25L), wsmax = c(25L, 20L, 35L, 35L, 25L, 30L
), gsmin = c(35L, 35L, 45L, 45L, 35L, 35L), gsmax = c(40L,
40L, 50L, 50L, 40L, 40L), wd = c(90L, 90L, 45L, 45L, 45L,
315L), rmin = c(11.8, 10, 7.3, 7.3, 6.7, 4.4), rmax = c(26.6,
23.5, 19, 19, 17.4, 13.8), cir = c(14.8, 13.5, 11.7, 11.7,
10.7, 9.4), lr = c(3L, 3L, 6L, 6L, 6L, 7L), lc = c(1L, 1L,
2L, 2L, 2L, 3L), wc = c(3L, 3L, 3L, 3L, 3L, 3L), li = c(TRUE,
TRUE, TRUE, TRUE, TRUE, FALSE), yd = c(1L, 1L, 1L, 1L, 1L,
1L), yr = c(2010L, 2010L, 2010L, 2010L, 2010L, 2010L), nF = c(2L,
8L, 0L, 0L, 0L, 0L), factdcx = structure(1:6, .Label = c("24",
"41", "48", "50", "52", "57"), class = "factor")), .Names = c("dc",
"tmin", "tmax", "cint", "wcmin", "wcmax", "wsmin", "wsmax", "gsmin",
"gsmax", "wd", "rmin", "rmax", "cir", "lr", "lc", "wc", "li",
"yd", "yr", "nF", "factdcx"), row.names = c("1:", "2:", "3:",
"4:", "5:", "6:"), class = "data.frame")

Related

How to subtract the specified percentage between observations to perform complex arithmetic operations in R

I have 2 datasets
d1=structure(list(mdm = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L), perc = c(50L, 60L, 70L, 80L, 85L, 90L, 95L,
96L, 97L, 98L, 99L, 100L, 101L, 102L, 103L, 104L, 105L, 110L,
115L, 120L, 130L, 140L, 150L, 50L, 60L, 70L, 80L, 85L, 90L, 95L,
96L, 97L, 98L, 99L, 100L, 101L, 102L, 103L, 104L, 105L, 110L,
115L, 120L, 130L, 140L, 150L), price = c(38.9, 46.68, 54.46,
62.24, 66.13, 70.02, 73.91, 74.688, 75.466, 76.244, 77.022, 77.8,
78.578, 79.356, 80.134, 80.912, 81.69, 85.58, 89.47, 93.36, 101.14,
108.92, 116.7, 38.9, 46.68, 54.46, 62.24, 66.13, 70.02, 73.91,
74.688, 75.466, 76.244, 77.022, 77.8, 78.578, 79.356, 80.134,
80.912, 81.69, 85.58, 89.47, 93.36, 101.14, 108.92, 116.7), count = c(175,
160, 145, 130, 122.5, 115, 107.5, 106, 104.5, 103, 101.5, 100,
98.5, 97, 95.5, 94, 92.5, 85, 77.5, 70, 55, 40, 25, 175, 160,
145, 130, 122.5, 115, 107.5, 106, 104.5, 103, 101.5, 100, 98.5,
97, 95.5, 94, 92.5, 85, 77.5, 70, 55, 40, 25), profit = c(2607.5,
3628.8, 4416.7, 4971.2, 5160.925, 5292.3, 5365.325, 5372.928,
5378.197, 5381.132, 5381.733, 5380, 5375.933, 5369.532, 5360.797,
5349.728, 5336.325, 5234.3, 5073.925, 4855.2, 4242.7, 3396.8,
2317.5, 2432.5, 3468.8, 4271.7, 4841.2, 5038.425, 5177.3, 5257.825,
5266.928, 5273.697, 5278.132, 5280.233, 5280, 5277.433, 5272.532,
5265.297, 5255.728, 5243.825, 5149.3, 4996.425, 4785.2, 4187.7,
3356.8, 2292.5)), class = "data.frame", row.names = c(NA, -46L
))
The second dataset gives the percentages by which the price and count values in d1 must be reduced or increased; d2 also contains the cost column:
d2=structure(list(mdm = 7:8, elast = c(1.5, 1.5), cost = 24:25), class = "data.frame", row.names = c(NA,
-2L))
I'm having some trouble with the arithmetic and need help.
I'll try to describe my question in more detail.
For each mdm group I need to scale the price value relative to the row where the perc column equals 100 (100 is always the starting point).
For example, for mdm = 7 the row with perc = 100 has price = 77.8.
The next perc value is 99, i.e. 1 less, so I subtract 1 percent from 77.8 and get 77.022. For perc = 85 I subtract 15% from the starting value 77.8 and get 66.13; for perc = 50 I subtract 50 percent from the starting value.
In the same way percentages are added going up: for perc = 101 the price is 1 percent above 77.8, i.e. 78.578. This is how the price column is formed.
Further, the row with perc = 100 also has a value in the count column; in this example it is also 100 (but this is not always the case).
For each mdm group I take the elast value from d2 and multiply it by the difference between 100 and perc. For example, where perc = 99 for mdm = 7, elast = 1.5 is multiplied by 1, so count = 100 + 1*1.5 = 101.5; where perc = 70 the difference is 30, so count = 100 + 30*1.5 = 145; and so on.
The last step: d2 also contains the cost for each mdm. From the price column just formed I subtract the cost (for mdm = 7, cost = 24, so 38.9 - 24 = 14.9) and multiply the result by the count column (here 175). This creates a new column profit = 14.9 * 175 = 2607.5.
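To make the rules concrete, here is a quick arithmetic check of the mdm = 7 examples above (the numbers are taken only from the description; price100 and count100 are just local names for the perc = 100 reference values):
price100 <- 77.8; count100 <- 100; elast <- 1.5; cost <- 24
price100 * 99 / 100                     # 77.022  (perc = 99)
price100 * 85 / 100                     # 66.13   (perc = 85)
count100 + (100 - 99) * elast           # 101.5   (perc = 99)
count100 + (100 - 70) * elast           # 145     (perc = 70)
(price100 * 50 / 100 - cost) * (count100 + (100 - 50) * elast)   # 2607.5 (profit at perc = 50)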
In the reproducible example above, the price, count and profit columns are already filled in by me to show the desired result (d1 is in fact the desired output).
The initial data actually look like this:
mdm perc price count
1 7 50 NA NA
2 7 60 NA NA
3 7 70 NA NA
4 7 80 NA NA
5 7 85 NA NA
6 7 90 NA NA
7 7 95 NA NA
8 7 96 NA NA
9 7 97 NA NA
10 7 98 NA NA
11 7 99 NA NA
**12 7 100 77.8 100**
13 7 101 NA NA
14 7 102 NA NA
15 7 103 NA NA
16 7 104 NA NA
17 7 105 NA NA
18 7 110 NA NA
19 7 115 NA NA
20 7 120 NA NA
21 7 130 NA NA
22 7 140 NA NA
23 7 150 NA NA
24 8 50 NA NA
25 8 60 NA NA
26 8 70 NA NA
27 8 80 NA NA
28 8 85 NA NA
29 8 90 NA NA
30 8 95 NA NA
31 8 96 NA NA
32 8 97 NA NA
33 8 98 NA NA
34 8 99 NA NA
**35 8 100 77.8 100**
36 8 101 NA NA
37 8 102 NA NA
38 8 103 NA NA
39 8 104 NA NA
40 8 105 NA NA
41 8 110 NA NA
42 8 115 NA NA
43 8 120 NA NA
44 8 130 NA NA
45 8 140 NA NA
46 8 150 NA NA
Thanks for any valuable help.
Using data.table
library(data.table)
setDT(d1)[d2, c("price", "count", "cost") :=
            .((price[perc == 100]/100) * perc,
              count[perc == 100] + (100 - perc) * elast,
              i.cost), on = .(mdm)]
d1[, last_step := (price - cost) * count]
-output
> head(d1)
mdm perc price count profit cost last_step
1: 7 50 38.90 175.0 2607.500 24 2607.500
2: 7 60 46.68 160.0 3628.800 24 3628.800
3: 7 70 54.46 145.0 4416.700 24 4416.700
4: 7 80 62.24 130.0 4971.200 24 4971.200
5: 7 85 66.13 122.5 5160.925 24 5160.925
6: 7 90 70.02 115.0 5292.300 24 5292.300
Here is one way:
First we join both dataframes,
then we define the rules as you describe them in detail (which makes them easy to translate to code :-).
I think the most challenging and tricky part is to fix the price value at 100%, in this case price[perc == 100]. The rest follows from your excellent explanation:
library(dplyr)
df %>%
  left_join(d2, by = "mdm") %>%
  group_by(mdm) %>%
  mutate(price = (price[perc == 100]/100) * perc,
         count = count[perc == 100] + (100 - perc) * elast,
         last_step = (price - cost) * count)
mdm perc price count elast cost last_step
<int> <int> <dbl> <dbl> <dbl> <int> <dbl>
1 7 50 38.9 175 1.5 24 2607.
2 7 60 46.7 160 1.5 24 3629.
3 7 70 54.5 145 1.5 24 4417.
4 7 80 62.2 130 1.5 24 4971.
5 7 85 66.1 122. 1.5 24 5161.
6 7 90 70.0 115 1.5 24 5292.
7 7 95 73.9 108. 1.5 24 5365.
8 7 96 74.7 106 1.5 24 5373.
9 7 97 75.5 104. 1.5 24 5378.
10 7 98 76.2 103 1.5 24 5381.
# … with 36 more rows
# ℹ Use `print(n = ...)` to see more rows
You should be able to produce d1 from the original frame as follows:
d1 %>%
  group_by(mdm) %>%
  mutate(price = price[perc == 100] * (1 - (100 - perc)/100)) %>%
  ungroup() %>%
  inner_join(d2, by = "mdm") %>%
  group_by(mdm) %>%
  mutate(count = count[perc == 100] + (100 - perc) * elast,
         profit = count * (price - cost)) %>%
  ungroup() %>%
  select(-c(elast, cost))
Input:
d1 = structure(list(mdm = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L), perc = c(50L, 60L, 70L, 80L, 85L, 90L, 95L,
96L, 97L, 98L, 99L, 100L, 101L, 102L, 103L, 104L, 105L, 110L,
115L, 120L, 130L, 140L, 150L, 50L, 60L, 70L, 80L, 85L, 90L, 95L,
96L, 97L, 98L, 99L, 100L, 101L, 102L, 103L, 104L, 105L, 110L,
115L, 120L, 130L, 140L, 150L), price = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 77.8, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 77.8,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), count = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 100, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 100, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA,
-46L))
Output:
mdm perc price count profit
1 7 50 38.900 175.0 2607.500
2 7 60 46.680 160.0 3628.800
3 7 70 54.460 145.0 4416.700
4 7 80 62.240 130.0 4971.200
5 7 85 66.130 122.5 5160.925
6 7 90 70.020 115.0 5292.300
7 7 95 73.910 107.5 5365.325
8 7 96 74.688 106.0 5372.928
9 7 97 75.466 104.5 5378.197
10 7 98 76.244 103.0 5381.132
11 7 99 77.022 101.5 5381.733
12 7 100 77.800 100.0 5380.000
13 7 101 78.578 98.5 5375.933
14 7 102 79.356 97.0 5369.532
15 7 103 80.134 95.5 5360.797
16 7 104 80.912 94.0 5349.728
17 7 105 81.690 92.5 5336.325
18 7 110 85.580 85.0 5234.300
19 7 115 89.470 77.5 5073.925
20 7 120 93.360 70.0 4855.200
21 7 130 101.140 55.0 4242.700
22 7 140 108.920 40.0 3396.800
23 7 150 116.700 25.0 2317.500
24 8 50 38.900 175.0 2432.500
25 8 60 46.680 160.0 3468.800
26 8 70 54.460 145.0 4271.700
27 8 80 62.240 130.0 4841.200
28 8 85 66.130 122.5 5038.425
29 8 90 70.020 115.0 5177.300
30 8 95 73.910 107.5 5257.825
31 8 96 74.688 106.0 5266.928
32 8 97 75.466 104.5 5273.697
33 8 98 76.244 103.0 5278.132
34 8 99 77.022 101.5 5280.233
35 8 100 77.800 100.0 5280.000
36 8 101 78.578 98.5 5277.433
37 8 102 79.356 97.0 5272.532
38 8 103 80.134 95.5 5265.297
39 8 104 80.912 94.0 5255.728
40 8 105 81.690 92.5 5243.825
41 8 110 85.580 85.0 5149.300
42 8 115 89.470 77.5 4996.425
43 8 120 93.360 70.0 4785.200
44 8 130 101.140 55.0 4187.700
45 8 140 108.920 40.0 3356.800
46 8 150 116.700 25.0 2292.500

data.table efficiently finding common pairs between 2 columns

Say I have a dataframe:
subject stim1 stim2 feedback
1 1003 50 51 1
2 1003 48 50 1
3 1003 49 51 1
4 1003 47 49 1
5 1003 47 46 1
6 1003 46 48 1
10 1003 50 48 1
428 1003 48 51 0
433 1003 46 50 0
434 1003 50 49 0
435 1003 54 59 0
I want to create a new column "transitive_pair" by:
grouping by subject (column 1), and then,
for every row in which feedback == 0 (starting at index 428; otherwise transitive_pair = NaN),
returning a boolean that tells me whether there is any chain of pairings (using only rows in which feedback == 1) that would transitively link the stim1 and stim2 values.
Working out a few examples:
Row 428: stim1 = 48 and stim2 = 51.
48 and 51 are not directly paired, but 51 was paired with 50 (row 1) and 50 was paired with 48 (row 10), so transitive_pair[428] = TRUE.
Row 433: stim1 = 46 and stim2 = 50.
46 and 48 were paired (row 6) and 48 was paired with 50 (row 2), so transitive_pair[433] = TRUE.
Row 435: stim1 = 54 and stim2 = 59.
There is no chain of pairs that could link them (59 is not paired with anything where feedback == 1), so transitive_pair[435] = FALSE.
desired output
subject stim1 stim2 feedback transitive_pair
1 1003 50 51 1 NaN
2 1003 48 50 1 NaN
3 1003 49 51 1 NaN
4 1003 47 49 1 NaN
5 1003 47 46 1 NaN
6 1003 46 48 1 NaN
10 1003 50 48 1 NaN
428 1003 48 51 0 1
433 1003 46 50 0 1
434 1003 50 49 0 1
435 1003 54 59 0 0
Any help would be greatly appreciated!
Here is a reproducible df:
structure(list(subject = c(1003L, 1003L, 1003L, 1003L, 1003L,
1003L, 1003L, 1003L, 1003L, 1003L, 1003L), stim1 = c(50L, 48L,
49L, 47L, 47L, 46L, 50L, 48L, 46L, 50L, 54L), stim2 = c(51L,
50L, 51L, 49L, 46L, 48L, 48L, 51L, 50L, 49L, 59L), feedback = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L), transitive_pair = c(NaN,
NaN, NaN, NaN, NaN, NaN, NaN, 1, 1, 1, 0)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 10L, 428L, 433L, 434L, 435L), class = "data.frame")
The columns "stim1" and "stim2" define an undirected graph. Create the graph for feedback == 1, get its connected components and for each row of the data.frame, check if the values of "stim1" and "stim2" belong to the same component. In the end assign NaN to the rows where feedback is 1.
suppressPackageStartupMessages(library(igraph))
inx <- df1$feedback == 1
g <- graph_from_data_frame(df1[inx, c("stim1", "stim2")], directed = FALSE)
plot(g)
g_comp <- components(g)$membership
df1$transitive_pair_2 <- apply(df1[c("stim1", "stim2")], 1, \(x) {
  i <- names(g_comp) == x[1]
  j <- names(g_comp) == x[2]
  if (any(i) & any(j))
    g_comp[i] == g_comp[j]
  else 0L
})
df1$transitive_pair_2[inx] <- NaN
df1
#> subject stim1 stim2 feedback transitive_pair transitive_pair_2
#> 1 1003 50 51 1 NaN NaN
#> 2 1003 48 50 1 NaN NaN
#> 3 1003 49 51 1 NaN NaN
#> 4 1003 47 49 1 NaN NaN
#> 5 1003 47 46 1 NaN NaN
#> 6 1003 46 48 1 NaN NaN
#> 10 1003 50 48 1 NaN NaN
#> 428 1003 48 51 0 1 1
#> 433 1003 46 50 0 1 1
#> 434 1003 50 49 0 1 1
#> 435 1003 54 59 0 0 0
Created on 2022-07-31 by the reprex package (v2.0.1)
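The question mentions grouping by subject, and the example data contain a single subject, so the code above works on the whole frame. With several subjects, one possible sketch (an assumption on my part, reusing the same df1 layout) is to apply the same idea within each subject and bind the pieces back together:
library(igraph)
df_multi <- do.call(rbind, by(df1, df1$subject, function(d) {
  inx <- d$feedback == 1
  # graph of the stimulus pairs seen with feedback == 1, for this subject only
  g <- graph_from_data_frame(d[inx, c("stim1", "stim2")], directed = FALSE)
  comp <- components(g)$membership
  d$transitive_pair_2 <- apply(d[c("stim1", "stim2")], 1, function(x) {
    i <- names(comp) == x[1]
    j <- names(comp) == x[2]
    if (any(i) && any(j)) as.integer(comp[i] == comp[j]) else 0L
  })
  d$transitive_pair_2[inx] <- NaN   # only feedback == 0 rows get a value
  d
}))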

filling in missing data using fitted value in R

I have a dataframe like this:
ID year age wage
1 2 1981 22 10000
2 2 1982 23 11000
3 2 1983 24 11500
4 2 1984 25 11000
5 2 1985 26 14000
6 2 1986 27 16000
7 2 1987 28 20000
8 2 1988 29 19000
9 2 1989 30 20000
10 2 1990 31 20000
11 2 1991 32 22000
12 2 1992 33 25000
13 2 1993 34 0
14 2 1994 35 NA
15 2 1995 36 0
16 2 1996 37 NA
17 2 1997 38 0
18 2 1998 39 NA
19 2 1999 40 0
20 2 2000 41 NA
21 2 2001 42 0
22 2 2002 43 NA
23 2 2003 44 0
24 2 2004 45 NA
25 2 2005 46 5500
26 2 2006 47 NA
27 2 2007 48 5000
28 2 2008 49 NA
29 2 2009 50 6000
30 2 2010 51 NA
31 2 2011 52 19000
32 2 2012 53 NA
33 2 2013 54 21000
34 2 2014 55 NA
35 2 2015 56 23000
36 3 1984 22 1300
37 3 1985 23 0
38 3 1986 24 1500
39 3 1987 25 1000
40 3 1988 26 0
I want to use an individual-specific regression of wage on age and age-squared to impute missing wage observations. I want to only impute when at least 5 non-missing observations are available.
As suggested by jay.sf, I tried the following but with fitted values:
df_imp <- do.call(rbind,
  by(df, df$ID, function(x) {
    IDs <- which(is.na(x$wage))
    if (length(x$wage[-IDs]) >= 5) {
      b <- lm(wage ~ poly(age, 2, raw = TRUE), x)$fitted.values
      x$wage[IDs] <- with(x, b)[IDs]
    }
    return(x)
  }))
I got the following results:
ID year age wage
36 2 1981 22 10000.000
37 2 1982 23 11000.000
38 2 1983 24 11500.000
39 2 1984 25 11000.000
40 2 1985 26 14000.000
41 2 1986 27 16000.000
42 2 1987 28 20000.000
43 2 1988 29 19000.000
44 2 1989 30 20000.000
45 2 1990 31 20000.000
46 2 1991 32 22000.000
47 2 1992 33 25000.000
48 2 1993 34 0.000
49 2 1994 35 7291.777
50 2 1995 36 0.000
51 2 1996 37 6779.133
52 2 1997 38 0.000
53 2 1998 39 7591.597
54 2 1999 40 0.000
55 2 2000 41 9729.168
56 2 2001 42 0.000
57 2 2002 43 13191.847
58 2 2003 44 0.000
59 2 2004 45 17979.633
60 2 2005 46 5500.000
61 2 2006 47 NA
62 2 2007 48 5000.000
63 2 2008 49 NA
64 2 2009 50 6000.000
65 2 2010 51 NA
66 2 2011 52 19000.000
67 2 2012 53 NA
68 2 2013 54 21000.000
69 2 2014 55 NA
70 2 2015 56 23000.000
You could use a simple if statement, without an else. Define an index vector IDs that identifies the missings, which you use both to count them and to subset your Y column wage.
For this you can use by(), which splits your data similarly to split() but lets you apply a function; just rbind the result.
It's probably wiser to use the coefficients rather than the fitted values, because fitted values are only returned for rows where Y is not NA, so they no longer line up with the positions you want to fill. And you need to use raw=TRUE in the poly.
DF.imp <- do.call(rbind,
  by(DF, DF$ID, function(x) {
    IDs <- which(is.na(x$wage))
    if (length(x$wage[-IDs]) >= 5) {
      b <- lm(wage ~ poly(age, 2, raw = TRUE), x)$coefficients
      x$wage[IDs] <- with(x, (b[1] + b[2]*age + b[3]*age^2))[IDs]
    }
    return(x)
  }))
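Another option, sketched here on the same DF (the name DF.imp2 is just for illustration), is to let predict() evaluate the fitted polynomial, which avoids writing the coefficients out by hand:
DF.imp2 <- do.call(rbind,
  by(DF, DF$ID, function(x) {
    IDs <- which(is.na(x$wage))
    if (sum(!is.na(x$wage)) >= 5) {
      fit <- lm(wage ~ poly(age, 2, raw = TRUE), data = x)   # NA wages are dropped when fitting
      x$wage[IDs] <- predict(fit, newdata = x[IDs, ])
    }
    return(x)
  }))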
Note that I've slightly changed your example data, so that ID 3 also has missings, but fewer than 5 non-missing values.
Result
DF.imp
# ID year age wage
# 2.1 2 1981 22 10000.000
# 2.2 2 1982 23 11000.000
# 2.3 2 1983 24 11500.000
# 2.4 2 1984 25 11000.000
# 2.5 2 1985 26 14000.000
# 2.6 2 1986 27 16000.000
# 2.7 2 1987 28 20000.000
# 2.8 2 1988 29 19000.000
# 2.9 2 1989 30 20000.000
# 2.10 2 1990 31 20000.000
# 2.11 2 1991 32 22000.000
# 2.12 2 1992 33 25000.000
# 2.13 2 1993 34 0.000
# 2.14 2 1994 35 7626.986
# 2.15 2 1995 36 0.000
# 2.16 2 1996 37 7039.387
# 2.17 2 1997 38 0.000
# 2.18 2 1998 39 6783.065
# 2.19 2 1999 40 0.000
# 2.20 2 2000 41 6858.020
# 2.21 2 2001 42 0.000
# 2.22 2 2002 43 7264.252
# 2.23 2 2003 44 0.000
# 2.24 2 2004 45 8001.761
# 2.25 2 2005 46 5500.000
# 2.26 2 2006 47 9070.546
# 2.27 2 2007 48 5000.000
# 2.28 2 2008 49 10470.609
# 2.29 2 2009 50 6000.000
# 2.30 2 2010 51 12201.948
# 2.31 2 2011 52 19000.000
# 2.32 2 2012 53 14264.565
# 2.33 2 2013 54 21000.000
# 2.34 2 2014 55 16658.458
# 2.35 2 2015 56 23000.000
# 3.36 3 1984 22 1300.000
# 3.37 3 1985 23 NA
# 3.38 3 1986 24 1500.000
# 3.39 3 1987 25 1000.000
# 3.40 3 1988 26 NA
Data
DF <- structure(list(ID = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), year = c(1981L,
1982L, 1983L, 1984L, 1985L, 1986L, 1987L, 1988L, 1989L, 1990L,
1991L, 1992L, 1993L, 1994L, 1995L, 1996L, 1997L, 1998L, 1999L,
2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L,
2009L, 2010L, 2011L, 2012L, 2013L, 2014L, 2015L, 1984L, 1985L,
1986L, 1987L, 1988L), age = c(22L, 23L, 24L, 25L, 26L, 27L, 28L,
29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L,
42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L,
55L, 56L, 22L, 23L, 24L, 25L, 26L), wage = c(10000L, 11000L,
11500L, 11000L, 14000L, 16000L, 20000L, 19000L, 20000L, 20000L,
22000L, 25000L, 0L, NA, 0L, NA, 0L, NA, 0L, NA, 0L, NA, 0L, NA,
5500L, NA, 5000L, NA, 6000L, NA, 19000L, NA, 21000L, NA, 23000L,
1300L, NA, 1500L, 1000L, NA)), row.names = c(NA, -40L), class = "data.frame")

Using dplyr to fill in missing values (through a join?)

I have a data frame (df1) that has some missing values (city, state):
SiteID City StateBasedIn Lat Lon Var1 Var2
4227 Richmond KY -39 -113 6 0
4987 Nashville TN -33 -97 7 0
4000 Newark NJ -39 -95 8 0
4925 Miami FL -40 -99 0 0
4437 Montgomery AL -32 -117 4 1
4053 Jonesboro AR -30 -98 8 1
df1 <- structure(list(SiteID = c(4227L, 4987L, 4000L, 4925L, 4437L,
4053L, 4482L, 4037L, 4020L, 1787L, 2805L, 3025L, 3027L, 3028L,
3029L, 3030L, 3031L, 3033L), City = structure(c(10L, 7L, 8L,
5L, 6L, 4L, 2L, 9L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"Arcata", "Jackson", "Jonesboro", "Miami", "Montgomery", "Nashville",
"Newark", "Portland", "Richmond"), class = "factor"), StateBasedIn = structure(c(6L,
10L, 8L, 5L, 2L, 3L, 4L, 9L, 7L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("", "AL", "AR", "CA", "FL", "KY", "MS", "NJ",
"OR", "TN"), class = "factor"), Lat = c(-39L, -33L, -39L, -40L,
-32L, -30L, -38L, -31L, -35L, -38L, -30L, -39L, -38L, -32L, -39L,
-31L, -38L, -34L), Lon = c(-113L, -97L, -95L, -99L, -117L, -98L,
-98L, -95L, -112L, -120L, -114L, -81L, -117L, -90L, -109L, -115L,
-81L, -104L), Var1 = c(6L, 7L, 8L, 0L, 4L, 8L, 1L, 8L, 0L, 3L,
3L, 7L, 4L, 8L, 0L, 8L, 1L, 3L), Var2 = c(0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L)), .Names = c("SiteID",
"City", "StateBasedIn", "Lat", "Lon", "Var1", "Var2"), class = "data.frame", row.names = c(NA,
-18L))
And I would like to fill those values in by merging with another data frame (df2) that has 3 of the same columns, but not all of the columns that are in df1:
SiteID City StateBasedIn
1787 Lusby MD
2805 Springdale AR
3025 Saukville WI
3027 Saukville WI
3028 Saukville WI
3029 Saukville WI
df2 <- structure(list(SiteID = c(1787L, 2805L, 3025L, 3027L, 3028L,
3029L, 3030L, 3031L, 3033L), City = structure(c("Lusby", "Springdale",
"Saukville", "Saukville", "Saukville", "Saukville", "Saukville",
"Mequon", "Mequon"), .Dim = c(9L, 1L)), StateBasedIn = structure(c("MD",
"AR", "WI", "WI", "WI", "WI", "WI", "WI", "WI"), .Dim = c(9L,
1L))), row.names = c(NA, -9L), class = "data.frame", .Names = c("SiteID",
"City", "StateBasedIn"))
So basically I would retain all of the information in df1 and fill in the missing values that are available from df2. As I'm not too familiar with all of the dplyr options yet, I tried the different 'join' options but had no luck. I also tried 'merge' from base R, but still no success. Is there another way to do this (preferably with dplyr)?
You can use a full_join from dplyr, along with replace and coalesce to put together a pretty concise solution.
library(dplyr)
library(purrr)
# Cleaning from r2evans (if you want to keep it to dplyr, just use r2evans' lapply method)
df1 <- mutate_if(df1, is.factor, as.character)
df2 <- dmap(df2, as.vector)

full_join(df1, df2, by = "SiteID") %>%
  mutate_at(vars(matches("City|StateBased")), funs(replace(., . == "", NA))) %>%
  mutate(City = coalesce(City.y, City.x),
         StateBasedIn = coalesce(StateBasedIn.y, StateBasedIn.x)) %>%
  select(-contains("."))
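On a recent dplyr (1.0.0 or later), rows_patch() is another route; the sketch below assumes the blanks are first converted to NA and the factor/matrix columns to plain character vectors (the intermediate names df1_na and df2_chr are only for illustration):
library(dplyr)
df1_na <- df1 %>%
  mutate(across(c(City, StateBasedIn), ~ na_if(as.character(.x), "")))
df2_chr <- df2 %>%
  mutate(across(c(City, StateBasedIn), ~ as.character(as.vector(.x))))
# rows_patch() fills NA values in df1_na with the matching values from df2_chr
rows_patch(df1_na, df2_chr, by = "SiteID")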
This solution is not very stylish, but at least it is a solution.
library(dplyr)
library(magrittr)
aux <- df1 %>%
  # filter missing values
  filter(City == "") %>%
  # drop City and StateBasedIn so that the columns are not duplicated after the join
  select(-c(City, StateBasedIn)) %>%
  # inner join with the second dataframe
  inner_join(df2, by = "SiteID") %>%
  # restore the original column order
  select(SiteID, City, StateBasedIn, Lat, Lon, Var1, Var2)

df1 %<>%
  # keep all rows whose values are not missing
  filter(City != "") %>%
  # bind the auxiliary dataframe
  rbind(aux)
Results in:
SiteID City StateBasedIn Lat Lon Var1 Var2
1 4227 Richmond KY -39 -113 6 0
2 4987 Nashville TN -33 -97 7 0
3 4000 Newark NJ -39 -95 8 0
4 4925 Miami FL -40 -99 0 0
5 4437 Montgomery AL -32 -117 4 1
6 4053 Jonesboro AR -30 -98 8 1
7 4482 Arcata CA -38 -98 1 1
8 4037 Portland OR -31 -95 8 1
9 4020 Jackson MS -35 -112 0 1
10 1787 Lusby MD -38 -120 3 0
11 2805 Springdale AR -30 -114 3 1
12 3025 Saukville WI -39 -81 7 1
13 3027 Saukville WI -38 -117 4 0
14 3028 Saukville WI -32 -90 8 0
15 3029 Saukville WI -39 -109 0 1
16 3030 Saukville WI -31 -115 8 0
17 3031 Mequon WI -38 -81 1 1
18 3033 Mequon WI -34 -104 3 0
Slightly simplified version of Felix's answer.
First, repairing the data by changing factor to character, and removing the apparent matrices from the second one:
str(df1)
# 'data.frame': 18 obs. of 7 variables:
# $ SiteID : int 4227 4987 4000 4925 4437 4053 4482 4037 4020 1787 ...
# $ City : Factor w/ 10 levels "","Arcata","Jackson",..: 10 7 8 5 6 4 2 9 3 1 ...
# $ StateBasedIn: Factor w/ 10 levels "","AL","AR","CA",..: 6 10 8 5 2 3 4 9 7 1 ...
# $ Lat : int -39 -33 -39 -40 -32 -30 -38 -31 -35 -38 ...
# $ Lon : int -113 -97 -95 -99 -117 -98 -98 -95 -112 -120 ...
# $ Var1 : int 6 7 8 0 4 8 1 8 0 3 ...
# $ Var2 : int 0 0 0 0 1 1 1 1 1 0 ...
str(df2)
# 'data.frame': 9 obs. of 3 variables:
# $ SiteID : int 1787 2805 3025 3027 3028 3029 3030 3031 3033
# $ City : chr [1:9, 1] "Lusby" "Springdale" "Saukville" "Saukville" ...
# $ StateBasedIn: chr [1:9, 1] "MD" "AR" "WI" "WI" ...
df1 <- mutate_if(df1, is.factor, as.character)
df2[] <- lapply(df2, as.vector)
Now the work:
library(dplyr)
df1 %>%
  left_join(select(df2, SiteID, cty = City, st = StateBasedIn), by = "SiteID") %>%
  mutate(
    City = ifelse(nzchar(City), City, cty),
    StateBasedIn = ifelse(grepl("[^\\s]", StateBasedIn), StateBasedIn, st)
  ) %>%
  select(-cty, -st)
# SiteID City StateBasedIn Lat Lon Var1 Var2
# 1 4227 Richmond KY -39 -113 6 0
# 2 4987 Nashville TN -33 -97 7 0
# 3 4000 Newark NJ -39 -95 8 0
# 4 4925 Miami FL -40 -99 0 0
# 5 4437 Montgomery AL -32 -117 4 1
# 6 4053 Jonesboro AR -30 -98 8 1
# 7 4482 Arcata CA -38 -98 1 1
# 8 4037 Portland OR -31 -95 8 1
# 9 4020 Jackson MS -35 -112 0 1
# 10 1787 Lusby MD -38 -120 3 0
# 11 2805 Springdale AR -30 -114 3 1
# 12 3025 Saukville WI -39 -81 7 1
# 13 3027 Saukville WI -38 -117 4 0
# 14 3028 Saukville WI -32 -90 8 0
# 15 3029 Saukville WI -39 -109 0 1
# 16 3030 Saukville WI -31 -115 8 0
# 17 3031 Mequon WI -38 -81 1 1
# 18 3033 Mequon WI -34 -104 3 0
I included two different ways to check for empty fields, uncertain if your example was conveniently clean in that regard; you can use either nzchar (empty vs non-empty) or the grepl("[^\\s]",...) solution (some non-whitespace present) easily. (Some data might also need is.na in the check ...)
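If you want a single reusable check, a tiny sketch combining those ideas (the helper name is_filled is hypothetical) could be:
# TRUE only when a value is non-NA and contains at least one non-whitespace character
is_filled <- function(x) !is.na(x) & grepl("[^[:space:]]", x)
# e.g. inside the mutate() above: City = ifelse(is_filled(City), City, cty)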

Merging output in R

max <- aggregate(cbind(a$VALUE, Date = a$DATE) ~ format(a$DATE, "%m") + cut(a$GRADE, breaks = c(0, 2, 4, 6, 8, 10, 12, 14)),
                 data = a, max)[-1]
max$Date <- as.Date(max$Date, origin = "1970-01-01")
Sample data:
DATE GRADE VALUE
2008-09-01 1 20
2008-09-02 2 30
2008-09-03 3 50
.
.
2008-09-30 2 75
.
.
2008-10-01 1 95
.
.
2008-11-01 4 90
.
.
2008-12-01 1 70
2008-12-02 2 40
2008-12-28 4 30
2008-12-29 1 40
2008-12-31 3 50
My expected output, according to the above table, for only the first month is:
DATE GRADE VALUE
2008-09-30 (0,2] 75
2008-09-03 (2,4] 50
Output in my real data:
format(DATE, "%m")
1 09
2 10
3 11
4 12
5 09
6 10
7 11
cut(a$GRADE, breaks = c(0, 2, 4, 6, 8, 10, 12, 14)) value
1 (0,2] 0.30844444
2 (0,2] 1.00000000
3 (0,2] 1.00000000
4 (0,2] 0.73333333
5 (2,4] 0.16983488
6 (2,4] 0.09368000
7 (2,4] 0.10589335
Date
1 2008-09-30
2 2008-10-31
3 2008-11-28
4 2008-12-31
5 2008-09-30
6 2008-10-31
7 2008-11-28
The output does not correspond to the sample data, because the real data is too big to show. The simple logic is that there are grades from 1 to 10, and I want to find the highest value in a month for each grade group, e.g. the highest value for each of the groups (0,2], (2,4], etc.
I used aggregate with the function max, grouping by the two columns Date and Grade. When I run the code and display the value of max, I get three tables printed one after the other. I want to plot this output, but I cannot do that in this form. So how can I merge all of these outputs into one table?
Try:
library(dplyr)
a %>%
  group_by(MONTH = format(DATE, "%m"), GRADE = cut(GRADE, breaks = seq(0, 14, by = 2))) %>%
  summarise_each(funs(max))
# MONTH GRADE DATE VALUE
#1 09 (0,2] 2008-09-30 75
#2 09 (2,4] 2008-09-03 50
#3 10 (0,2] 2008-10-01 95
#4 11 (2,4] 2008-11-01 90
#5 12 (0,2] 2008-12-29 70
#6 12 (2,4] 2008-12-31 50
Or using data.table
library(data.table)
setDT(a)[, list(DATE = max(DATE), VALUE = max(VALUE)),
         by = list(MONTH = format(DATE, "%m"),
                   GRADE = cut(GRADE, breaks = seq(0, 14, by = 2)))]
# MONTH GRADE DATE VALUE
#1: 09 (0,2] 2008-09-30 75
#2: 09 (2,4] 2008-09-03 50
#3: 10 (0,2] 2008-10-01 95
#4: 11 (2,4] 2008-11-01 90
#5: 12 (0,2] 2008-12-29 70
#6: 12 (2,4] 2008-12-31 50
Or using aggregate
res <- transform(with(a,
         aggregate(cbind(VALUE, DATE),
                   list(MONTH = format(DATE, "%m"), GRADE = cut(GRADE, breaks = seq(0, 14, by = 2))),
                   max)),
       DATE = as.Date(DATE, origin = "1970-01-01"))
res[order(res$MONTH),]
# MONTH GRADE VALUE DATE
#1 09 (0,2] 75 2008-09-30
#4 09 (2,4] 50 2008-09-03
#2 10 (0,2] 95 2008-10-01
#5 11 (2,4] 90 2008-11-01
#3 12 (0,2] 70 2008-12-29
#6 12 (2,4] 50 2008-12-31
data
a <- structure(list(DATE = structure(c(14123, 14124, 14125, 14152,
14153, 14184, 14214, 14215, 14241, 14242, 14244), class = "Date"),
GRADE = c(1L, 2L, 3L, 2L, 1L, 4L, 1L, 2L, 4L, 1L, 3L), VALUE = c(20L,
30L, 50L, 75L, 95L, 90L, 70L, 40L, 30L, 40L, 50L)), .Names = c("DATE",
"GRADE", "VALUE"), row.names = c(NA, -11L), class = "data.frame")
Update
If you want to include YEAR also in the grouping
library(dplyr)
a %>%
  group_by(MONTH = format(DATE, "%m"), YEAR = format(DATE, "%Y"),
           GRADE = cut(GRADE, breaks = seq(0, 14, by = 2))) %>%
  summarise_each(funs(max))
# MONTH YEAR GRADE DATE VALUE
#1 09 2008 (0,2] 2008-09-30 75
#2 09 2008 (2,4] 2008-09-03 50
#3 09 2009 (0,2] 2009-09-30 75
#4 09 2009 (2,4] 2009-09-03 50
#5 10 2008 (0,2] 2008-10-01 95
#6 10 2009 (0,2] 2009-10-01 95
#7 11 2008 (2,4] 2008-11-01 90
#8 11 2009 (2,4] 2009-11-01 90
#9 12 2008 (0,2] 2008-12-29 70
#10 12 2008 (2,4] 2008-12-31 50
#11 12 2009 (0,2] 2009-12-29 70
#12 12 2009 (2,4] 2009-12-31 50
data
a <- structure(list(DATE = structure(c(14123, 14124, 14125, 14152,
14153, 14184, 14214, 14215, 14241, 14242, 14244, 14488, 14489,
14490, 14517, 14518, 14549, 14579, 14580, 14606, 14607, 14609
), class = "Date"), GRADE = c(1L, 2L, 3L, 2L, 1L, 4L, 1L, 2L,
4L, 1L, 3L, 1L, 2L, 3L, 2L, 1L, 4L, 1L, 2L, 4L, 1L, 3L), VALUE = c(20L,
30L, 50L, 75L, 95L, 90L, 70L, 40L, 30L, 40L, 50L, 20L, 30L, 50L,
75L, 95L, 90L, 70L, 40L, 30L, 40L, 50L)), .Names = c("DATE",
"GRADE", "VALUE"), row.names = c("1", "2", "3", "4", "5", "6",
"7", "8", "9", "10", "11", "12", "21", "31", "41", "51", "61",
"71", "81", "91", "101", "111"), class = "data.frame")
The following base R code may also be helpful (using the 'a' dataframe from akrun's answer):
xx <- strsplit(as.character(a$DATE), '-')
a$month <- sapply(xx, '[', 2)                                   # extract the month part of the date
gradeCats <- cut(a$GRADE, breaks = c(0, 2, 4, 6, 8, 10, 12, 14))
aggregate(VALUE ~ month + gradeCats, data = a, max)
month gradeCats VALUE
1 09 (0,2] 75
2 10 (0,2] 95
3 12 (0,2] 70
4 09 (2,4] 50
5 11 (2,4] 90
6 12 (2,4] 50
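Since the original goal was to plot the merged output, here is a minimal plotting sketch, assuming the merged result is the res data frame from the aggregate() approach above (columns MONTH, GRADE, VALUE):
library(ggplot2)
ggplot(res, aes(x = MONTH, y = VALUE, fill = GRADE)) +
  geom_col(position = "dodge") +                        # one bar per grade group within each month
  labs(x = "Month", y = "Maximum value", fill = "Grade group")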
