Related
Here is a simplified version of the data I am working with:
dat <- data.frame(country = c("country1", "country2", "country3", "country1", "country2"),
                  measurement = c("m1", "m1", "m1", "m2", "m2"),
                  y2015 = c(NA, 15, 19, 13, 55), y2016 = c(NA, 17, NA, 10, NA),
                  y2017 = c(14, NA, NA, 9, 45), y2018 = c(18, 22, 16, NA, 40))
I am trying to take the difference between the two non-missing values on either side of the NAs and replace the missing values with evenly spaced values in between (i.e., the average change per year).
For row 5, this would be something like c(55, 50, 45, 40).
However, it also needs to work for rows that have more than one missing value in a sequence, like row 1 and row 3. For row 1, I'd like the difference between 14 and 18 to be extrapolated backwards, so it should look something like c(6, 10, 14, 18). Meanwhile, for row 3, the difference between 19 and 16 should be divided between the two missing years, to look something like c(19, 18, 17, 16).
Essentially, I'm looking to create a slope for each country and measurement through the available years, and to interpolate the missing values based on that.
I am trying to think of a package for this, or perhaps I need to write a loop. I have looked at the 'spline' package, but it does not seem to work since I want to run a separate linear interpolation for each country and measurement.
Any thoughts would be greatly appreciated!
Use zoo::na.spline:
library(zoo)
dat[-c(1:2)] <- t(na.spline(t(dat[-c(1:2)])))
country measurement y2015 y2016 y2017 y2018
1 country1 m1 6 10 14.00000 18
2 country2 m1 15 17 19.33333 22
3 country3 m1 19 18 17.00000 16
4 country1 m2 13 10 9.00000 10
5 country2 m2 55 50 45.00000 40
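If you literally want a straight-line slope per row (the spline fit above is close, but not exactly linear once a row has three or more observed years), here is a minimal base R sketch of that idea. It assumes the dat data frame defined above and at least two non-missing years per row; fill_linear is just a hypothetical helper name.
years <- 2015:2018
fill_linear <- function(v) {
  if (sum(!is.na(v)) < 2) return(v)   # need at least two points to fit a slope
  fit <- lm(v ~ years)                # straight line through the available years
  replace(v, is.na(v), predict(fit, data.frame(years = years[is.na(v)])))
}
dat[-(1:2)] <- t(apply(dat[-(1:2)], 1, fill_linear))
For rows with exactly two observed years (rows 1 and 3) this reproduces c(6, 10, 14, 18) and c(19, 18, 17, 16) exactly.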
I have a list, cj1, with multiple data frames:
dput(head(cj1[1:2]))
list(structure(list(individual = c("a12TTT.pdf", "a15.pdf", "a17.pdf",
"a18.pdf", "a21.pdf", "a2TTT.pdf", "a5.pdf", "B11.pdf", "B12.pdf",
"B13.pdf", "B22.pdf", "B24.pdf", "B4.pdf", "B7.pdf", "B8.pdf",
"cw10-1.pdf", "cw13-1.pdf", "cw15-1TTT.pdf", "cw17-1.pdf", "cw18.pdf",
"cw3.pdf", "cw4.pdf", "cw7_1TTT.pdf"), id = 1:23, Ntot = c(13,
9, 16, 15, 9, 13, 10, 10, 11, 10, 14, 10, 11, 12, 11, 10, 15,
12, 14, 11, 9, 10, 11), N1 = c(5, 5, 10, 11, 7, 9, 5, 5, 6, 8,
8, 8, 9, 8, 7, 1, 0, 6, 3, 4, 2, 4, 2), ND = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), N0 = c(8,
4, 6, 4, 2, 4, 5, 5, 5, 2, 6, 2, 2, 4, 4, 9, 15, 6, 11, 7, 7,
6, 9), score = c(5.06923076923077, 4.96666666666667, 9.925, 10.86,
6.83333333333333, 8.88461538461539, 5, 5, 5.97272727272727, 7.82,
7.95714285714286, 7.82, 8.80909090909091, 7.9, 6.91818181818182,
1.24, 0.3, 6, 3.17142857142857, 4.08181818181818, 2.16666666666667,
4.06, 2.19090909090909), propscore = c(0.389940828402367, 0.551851851851852,
0.6203125, 0.724, 0.759259259259259, 0.683431952662722, 0.5,
0.5, 0.54297520661157, 0.782, 0.568367346938776, 0.782, 0.800826446280992,
0.658333333333333, 0.628925619834711, 0.124, 0.02, 0.5, 0.226530612244898,
0.371074380165289, 0.240740740740741, 0.406, 0.199173553719008
), theta = c(-0.571211122446447, 0.418736780198501, 0.464533662219296,
0.760432013134893, 1.43961032059382, 0.935963883364303, 0.0742361005467161,
0.416783201347136, 0.232586422933618, 1.65345248955369, 0.178947462869717,
1.3980442736112, 1.5300599487058, 0.340087410746963, 0.616985944469495,
-1.73246102772711, -4.06186172096556, -0.347700710331151, -1.21009964741398,
0.239145600406579, -1.88836418690337, -0.276451472526056, -0.611455626388059
), se.theta = c(0.689550115014498, 0.689441554709003, 0.595659709892116,
0.609506508256404, 0.917792293663691, 0.652011367164736, 0.720534163064516,
0.695969555549033, 0.661019531367007, 0.87050969318314, 0.605775647419845,
0.797443937820774, 0.768436114096332, 0.695748274310803, 0.709380679025605,
1.00089414765463, 1.8701468050665, 0.68959824350285, 0.733014089189809,
0.656392513303483, 0.952935324276941, 0.71608982789968, 0.771906532861938
), outfit = c(1.24922700170817, 1.46067763769417, 0.915183304626819,
0.753992664091072, 0.37410361433915, 0.727316037037668, 0.616907868814702,
1.01528298230254, 1.01594232662062, 0.616808170683195, 0.646097057961938,
0.622993494551005, 0.807441271101246, 0.788526018181888, 1.2157399735092,
0.341189086206191, 0.021052091633073, 0.543024513106335, 1.04183076617928,
1.1772656963046, 0.736106160865241, 0.756316095787985, 0.58320701094964
), infit = c(1.4078580948461, 1.42854494963967, 1.09762978932861,
0.893957122448352, 0.64936943769433, 0.899191443180872, 0.724956556509282,
1.14975990693782, 1.08074439712469, 0.978248081241133, 0.755557633771936,
0.823903684368671, 0.911855771375284, 0.954272320131035, 0.926253596526142,
0.634052701587448, 0.0504659822408584, 0.712539957033173, 0.966034039620798,
1.1901663169553, 0.81371119642719, 0.817417869881874, 0.737574872116582
)), row.names = c(NA, -23L), class = "data.frame"), structure(list(
parlabel = c("Ties", "Home"), par = c("delta", "eta"), est = c(-43.5016417611571,
0.337872999554289), se = c(366043197.615422, 0.215169736220537
)), row.names = c(NA, -2L), class = "data.frame"))
Here is how the data frames look:
head(cj1[[1]],2)
individual id Ntot N1 ND N0 score propscore theta se.theta outfit
1 a12TTT.pdf 1 13 5 0 8 5.069231 0.3899408 -0.5712111 0.6895501 1.249227
2 a15.pdf 2 9 5 0 4 4.966667 0.5518519 0.4187368 0.6894416 1.460678
infit
1 1.407858
2 1.428545
I would like to create a separate list, results1, containing data frames with only columns 1 and 9, named individual and theta.
I tried:
results1<-sapply(cj1, "[",c("individual","theta") )
Error in [.data.frame(X[[i]], ...) : undefined columns selected
library(dplyr)
> results1 <- lapply(cj1, function(x) x%>% select(individual,theta))
Error:
Can't subset columns that don't exist.
x Column individual doesn't exist.
Run rlang::last_error() to see where the error occurred.
I can extract these columns from one data frame:
cj1[[1]][c(1, 9)]
I could not apply this to the whole list.
You can use the following solution. Here .x refers to each individual element of your list, i.e. each data frame, from which we want to select the two columns c("individual", "theta").
However, since only one of your data frames contains those column names, I used the keep function to retain only the elements that contain the desired columns. Just bear in mind that this form of coding, called a purrr-style formula, needs a ~ before .x. The map function is the purrr equivalent of lapply from base R, and with this syntax you can apply whatever function you like to every individual element (here, every data frame).
library(purrr)
library(dplyr)   # for select()

cj1 %>%
  map_if(~ all(c("individual", "theta") %in% names(.x)),
         ~ .x %>% select(individual, theta)) %>%
  keep(~ all(c("individual", "theta") %in% names(.x)))
[[1]]
individual theta
1 a12TTT.pdf -0.5712111
2 a15.pdf 0.4187368
3 a17.pdf 0.4645337
4 a18.pdf 0.7604320
5 a21.pdf 1.4396103
6 a2TTT.pdf 0.9359639
7 a5.pdf 0.0742361
8 B11.pdf 0.4167832
9 B12.pdf 0.2325864
10 B13.pdf 1.6534525
11 B22.pdf 0.1789475
12 B24.pdf 1.3980443
13 B4.pdf 1.5300599
14 B7.pdf 0.3400874
15 B8.pdf 0.6169859
16 cw10-1.pdf -1.7324610
17 cw13-1.pdf -4.0618617
18 cw15-1TTT.pdf -0.3477007
19 cw17-1.pdf -1.2100996
20 cw18.pdf 0.2391456
21 cw3.pdf -1.8883642
22 cw4.pdf -0.2764515
23 cw7_1TTT.pdf -0.6114556
Or we can save a line of code and be more concise:
cj1 %>%
  keep(~ all(c("individual", "theta") %in% names(.x))) %>%
  map(~ .x %>% select(individual, theta))
[[1]]
individual theta
1 a12TTT.pdf -0.5712111
2 a15.pdf 0.4187368
3 a17.pdf 0.4645337
4 a18.pdf 0.7604320
5 a21.pdf 1.4396103
6 a2TTT.pdf 0.9359639
7 a5.pdf 0.0742361
8 B11.pdf 0.4167832
9 B12.pdf 0.2325864
10 B13.pdf 1.6534525
11 B22.pdf 0.1789475
12 B24.pdf 1.3980443
13 B4.pdf 1.5300599
14 B7.pdf 0.3400874
15 B8.pdf 0.6169859
16 cw10-1.pdf -1.7324610
17 cw13-1.pdf -4.0618617
18 cw15-1TTT.pdf -0.3477007
19 cw17-1.pdf -1.2100996
20 cw18.pdf 0.2391456
21 cw3.pdf -1.8883642
22 cw4.pdf -0.2764515
23 cw7_1TTT.pdf -0.6114556
Here is just another base R solution with a slightly different syntax. Just note that \(x) is equivalent to function(x), a shorthand available as of R 4.1.0.
cj1 |>
  lapply(\(x) {
    if (all(c("individual", "theta") %in% names(x))) {
      `[`(x, c("individual", "theta"))
    }
  }) -> cj2
cj2 <- cj2[-which(sapply(cj2, is.null))] |> as.data.frame()
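One small caveat about the last line: if it ever happens that no element is NULL, -which(...) indexes with a zero-length vector and silently returns an empty list. A sketch of a safer way to drop the NULL elements (and you may want to skip the as.data.frame() if the goal is a list of data frames):
cj2 <- Filter(Negate(is.null), cj2)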
In base R, you can try this solution with lapply -
cols <- c("individual","theta")
lapply(cj1, function(x) if(all(cols %in% names(x))) x[cols])
#[[1]]
# individual theta
#1 a12TTT.pdf -0.5712
#2 a15.pdf 0.4187
#3 a17.pdf 0.4645
#4 a18.pdf 0.7604
#5 a21.pdf 1.4396
#6 a2TTT.pdf 0.9360
#7 a5.pdf 0.0742
#8 B11.pdf 0.4168
#9 B12.pdf 0.2326
#10 B13.pdf 1.6535
#11 B22.pdf 0.1789
#12 B24.pdf 1.3980
#13 B4.pdf 1.5301
#14 B7.pdf 0.3401
#15 B8.pdf 0.6170
#16 cw10-1.pdf -1.7325
#17 cw13-1.pdf -4.0619
#18 cw15-1TTT.pdf -0.3477
#19 cw17-1.pdf -1.2101
#20 cw18.pdf 0.2391
#21 cw3.pdf -1.8884
#22 cw4.pdf -0.2765
#23 cw7_1TTT.pdf -0.6115
#[[2]]
#NULL
If you want to drop the NULL elements you can add Filter -
Filter(length, lapply(cj1, function(x) if(all(cols %in% names(x))) x[cols]))
Here is a little example:
mydat=structure(list(a = c(8, 83, 8.5, 8.5, 7.5, 7.8, 7.5, 8, 7.5,
8, 8), b = c(69.5, 70, 69.5, 68.5, 70, 69.5, 69.5, 70, 69.5,
68.5, 70), PROB_POSTR_KM = c(378884L, 378884L, 378884L, 378884L,
378884L, 378884L, 404136L, 404136L, 404136L, 404136L, 404136L
)), class = "data.frame", row.names = c(NA, -11L))
Here the variable PROB_POSTR_KM has the value 378884 with a count of 6, and the value 404136 with a count of 5.
How do I remove duplicate values, if they exist, and keep only the most recent (last) row for each value?
In this case, the desired result looks like
a b PROB_POSTR_KM
1 7.8 69.5 378884
2 8.0 70.0 404136
library(data.table)
setDT(mydat)
mydat[, tail(.SD, 1), PROB_POSTR_KM]
# PROB_POSTR_KM a b
# 1: 378884 7.8 69.5
# 2: 404136 8.0 70.0
Here is a dplyr solution:
library(dplyr)
mydat %>%
  group_by(PROB_POSTR_KM) %>%
  slice(which.max(1:n()))
Gives us:
# A tibble: 2 x 3
# Groups: PROB_POSTR_KM [2]
a b PROB_POSTR_KM
<dbl> <dbl> <int>
1 7.8 69.5 378884
2 8 70 404136
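With dplyr 1.0.0 or later, slice_tail() expresses the same "last row per group" idea a bit more directly; a minimal sketch, assuming the same mydat as above:
library(dplyr)
mydat %>%
  group_by(PROB_POSTR_KM) %>%
  slice_tail(n = 1) %>%
  ungroup()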
I have a dataset called CSES (Comparative Study of Electoral Systems) where each row corresponds to an individual (one interview in a public opinion survey), from many countries, in many different years.
I need to create a variable which identifies the ideology of the party each person voted for, as perceived by that same person.
However, the dataset identifies this perceived ideology of each party (as with many other variables) by letters A, B, C, etc. When it comes to identifying WHICH PARTY each person voted for, it uses a UNIQUE CODE NUMBER that does not correspond to these letters consistently across years (i.e., the same party can have a different letter in different years; and, of course, it is never the same party across different countries, since each country has its own political parties).
Fictitious data to help clarify, reproduce, and write code against:
Let’s say:
country = c(1,1,1,1,2,2,2,2,3,3,3,3)
year = c (2000,2000,2004,2004, 2002,2002,2004,2008,2000,2000,2000,2000)
party_A_number = c(11,11,12,12,21,21,22,23,31,31,31,31)
party_B_number = c(12, 12, 11, 11, 22,22,21,22,32,32,32,32)
party_C_number = c(13,13,13,13,23,23,23,21,33,33,33,33)
party_voted = c(12,13,12,11,21,24,23,22,31,32,33,31)
ideology_party_A <- floor(runif (12, min=1, max=10))
ideology_party_B <- floor(runif (12, min=1, max=10))
ideology_party_C <- floor(runif (12, min=1, max=10))
Let’s call the variable I want to create “ideology_voted”:
I need something like:
IF party_A_number == party_voted THEN ideology_voted = ideology_party_A
IF party_B_number == party_voted THEN ideology_voted = ideology_party_B
IF party_C_number == party_voted THEN ideology_voted = ideology_party_C
The real dataset has 9 letters for (up to) 9 main parties in each country, across dozens of countries and election years. Therefore, it would be great to have code where I could iterate through letters A-I instead of writing "if voted party A, then ...; if voted party B, then ...".
Nevertheless, I am having trouble even with longer, repetitive code (one transformation for each party letter, which would give me 8 lines of code).
library(tidyverse)
df <- tibble(
country = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3),
year = c(2000, 2000, 2004, 2004, 2002, 2002, 2004, 2008, 2000, 2000, 2000, 2000),
party_A_number = c(11, 11, 12, 12, 21, 21, 22, 23, 31, 31, 31, 31),
party_B_number = c(12, 12, 11, 11, 22, 22, 21, 22, 32, 32, 32, 32),
party_C_number = c(13, 13, 13, 13, 23, 23, 23, 21, 33, 33, 33, 33),
party_voted = c(12, 13, 12, 11, 21, 24, 23, 22, 31, 32, 33, 31),
ideology_party_A = floor(runif (12, min = 1, max = 10)),
ideology_party_B = floor(runif (12, min = 1, max = 10)),
ideology_party_C = floor(runif (12, min = 1, max = 10))
)
> df
# A tibble: 12 x 9
country year party_A_number party_B_number party_C_number party_voted ideology_party_A ideology_party_B
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2000 11 12 13 12 9 3
2 1 2000 11 12 13 13 2 6
3 1 2004 12 11 13 12 3 8
4 1 2004 12 11 13 11 7 8
5 2 2002 21 22 23 21 2 7
6 2 2002 21 22 23 24 8 2
7 2 2004 22 21 23 23 1 7
8 2 2008 23 22 21 22 7 7
9 3 2000 31 32 33 31 4 3
10 3 2000 31 32 33 32 7 5
11 3 2000 31 32 33 33 1 6
12 3 2000 31 32 33 31 2 1
# ... with 1 more variable: ideology_party_C <dbl>
It seems you're after conditioning using case_when:
ideology_voted <- df %>% transmute(
  ideology_voted = case_when(
    party_A_number == party_voted ~ ideology_party_A,
    party_B_number == party_voted ~ ideology_party_B,
    party_C_number == party_voted ~ ideology_party_C,
    TRUE ~ party_voted
  )
)
> ideology_voted
# A tibble: 12 x 1
ideology_voted
<dbl>
1 3
2 7
3 3
4 8
5 2
6 24
7 8
8 7
9 4
10 5
11 6
12 2
Note that case_when works through the conditions in order, so the value comes from the first condition that is TRUE (if more than one happens to match, the earliest one wins). The final TRUE ~ party_voted acts as a catch-all for votes that match none of the listed parties.
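Since the question mentions iterating over letters A-I rather than writing one case_when line per party, here is a small base R sketch of that idea; it assumes the column-name pattern party_<LETTER>_number / ideology_party_<LETTER> used in the example data, and party_letters is a hypothetical name.
party_letters <- c("A", "B", "C")   # use LETTERS[1:9] for the real data
df$ideology_voted <- NA_real_
for (L in party_letters) {
  num <- df[[paste0("party_", L, "_number")]]
  ide <- df[[paste0("ideology_party_", L)]]
  hit <- !is.na(num) & num == df$party_voted & is.na(df$ideology_voted)
  df$ideology_voted[hit] <- ide[hit]
}
Rows whose party_voted matches none of the listed parties (e.g. the code 24 above) keep NA, which you can then recode however you prefer.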
I am trying to calculate the number of weeks the inventory on hand will last, given the sales projections, for a dataset with tens of millions of rows. I have listed the expected output in the last column of the data structure given below, and I also attached my current implementation of this in Excel.
Logic
WeeksofSupply = the number of weeks the current inventory on hand will last.
Example (SKU_CD 222, STORE_CD 33): the inventory on hand is 19 and the cumulative sales are
WK1 + WK2 = 15 and WK1 + WK2 + WK3 = 24, which is greater than 19,
so we pick 2, the count of weeks the current inventory will last.
The expected output is in the last column.
Data = structure(list(
SKU_CD = c(111, 111, 111, 111, 111, 111, 111,111, 111, 111, 111, 111, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222),
STORE_CD = c(22, 22, 22, 22, 22, 22, 22,22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33),
FWK_CD = c(201627, 201628, 201629, 201630, 201631, 201632,201633, 201634, 201635, 201636, 201637, 201638, 201627, 201628, 201629, 201630, 201631, 201632, 201633, 201634, 201635, 201636, 201637, 201638),
SALES = c(5, 2, 2, 2, 1, 3, 2, 2, 3, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 7, 5),
INVENTORY = c(29, 27, 25, 23, 22, 19, 17, 15, 12, 10, 25, 1, 19, 17, 15, 13, 12,9, 7, 5, 2, 0, 25, 18),
WeeksofSupply = c("11", "10", "9", "8", "8", "6", "5", "4", "3", "2", "Inventory More", "Inventory Less", "2", "2", "1", "1", "1", "Inventory Less", "Inventory Less", "Inventory Less", "Inventory Less", "Inventory Less", "Inventory More", "Inventory More")),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -24L),
.Names = c("SKU_CD", "STORE_CD", "FWK_CD", "SALES", "INVENTORY", "WeeksofSupply"))
Current Excel code (here the weeks are shown in columns, but they should be rows, as in the expected output):
=IF(A2<SUM(B2:K2),
    SUMPRODUCT(--(SUBTOTAL(9,OFFSET(B2:K2,,,,COLUMN(B2:K2)-COLUMN(B2)+1))<=A2))
      +LOOKUP(0,
              SUBTOTAL(9,OFFSET(B2:K2,,,,COLUMN(B2:K2)-COLUMN(B2)+1))-B2:K2-A2,
              (A2-(SUBTOTAL(9,OFFSET(B2:K2,,,,COLUMN(B2:K2)-COLUMN(B2)+1))-B2:K2))/B2:K2),
    IF(A2=SUM(B2:K2),COUNT(B2:K2),"Inventory exceeds forecast"))
I would appreciate any input on how to implement this efficiently in R. Many thanks for your time!
For your revised data in long format, you can do the following...
library(dplyr)   # for the grouping functionality

# define a function to calculate weeks of supply from sales and inventory
weekSup <- function(sales, inv) {
  sales <- unlist(sales)
  inv <- unlist(inv)
  n <- length(sales)
  weeksup <- rep(NA, n)
  for (i in 1:n) {
    if (i == n | inv[i] < sales[i]) {
      weeksup[i] <- ifelse(inv[i] > sales[i], NA, inv[i] / sales[i])
    } else {
      weeksup[i] <- approxfun(cumsum(sales[i:n]), 1:(n - i + 1))(inv[i])
    }
  }
  # 'Inventory More' is coded as -1 (a number) to avoid the whole column being forced to character
  weeksup <- replace(weeksup, is.na(weeksup), -1)
  return(weeksup)   # for whole weeks, change this to `return(floor(weeksup))`
}
Data2 <- Data %>% group_by(SKU_CD,STORE_CD) %>% mutate(weekSup=weekSup(SALES,INVENTORY))
head(Data2,20)
SKU_CD STORE_CD FWK_CD SALES INVENTORY WeeksofSupply weekSup
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
1 111 22 201627 5 29 11 11.3333333
2 111 22 201628 2 27 10 10.8333333
3 111 22 201629 2 25 9 9.8333333
4 111 22 201630 2 23 8 8.8333333
5 111 22 201631 1 22 8 8.0000000
6 111 22 201632 3 19 6 6.6666667
7 111 22 201633 2 17 5 5.8333333
8 111 22 201634 2 15 4 4.8333333
9 111 22 201635 3 12 3 3.6666667
10 111 22 201636 2 10 2 2.8333333
11 111 22 201637 3 25 Inventory More -1.0000000
12 111 22 201638 6 1 Inventory Less 0.1666667
13 222 33 201627 7 19 2 2.4444444
14 222 33 201628 8 17 2 2.0000000
15 222 33 201629 9 15 1 1.6000000
16 222 33 201630 10 13 1 1.2727273
17 222 33 201631 11 12 1 1.0833333
18 222 33 201632 12 9 Inventory Less 0.7500000
19 222 33 201633 13 7 Inventory Less 0.5384615
20 222 33 201634 14 5 Inventory Less 0.3571429
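If whole weeks are all you need (as in the expected WeeksofSupply column), a more compact cumsum-based sketch of the same idea, again grouping by SKU and store; weeks_whole is a hypothetical column name, NA plays the role of "Inventory More" and 0 the role of "Inventory Less":
library(dplyr)
Data %>%
  group_by(SKU_CD, STORE_CD) %>%
  mutate(weeks_whole = sapply(seq_along(SALES), function(i) {
    cs <- cumsum(SALES[i:length(SALES)])            # cumulative future sales from week i onward
    if (INVENTORY[i] > cs[length(cs)]) NA_integer_  # more inventory than the remaining forecast
    else sum(cs <= INVENTORY[i])                    # number of whole weeks covered
  })) %>%
  ungroup()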
Here is one way to do it with the linear-interpolation helper approxfun, for the data in its original wide format (inventory in the first column, weeks Wk1-Wk10 in columns 2-11)...
data$WeeksSupply <- sapply(1:nrow(data), function(i)
  approxfun(cumsum(unlist(data[i, 2:11])), 1:10)(data$Inventory[i]))
data$WeeksSupply <- replace(data$WeeksSupply, is.na(data$WeeksSupply),
                            "Inventory Exceeds Forecast")
data
# A tibble: 2 x 12
Inventory Wk1 Wk2 Wk3 Wk4 Wk5 Wk6 Wk7 Wk8 Wk9 Wk10 WeeksSupply
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 200 20 15 25 40 35 45 30 50 45 55 6.66666666666667
2 2000 20 15 25 40 35 45 30 50 45 55 Inventory Exceeds Forecast