Procedure to update with cursor got partial update result - PL/SQL

I don't know why my code updated only 2 of the 8 rows selected by the cursor. The procedure is supposed to update every product whose cost is lower than the newly calculated cost, setting it to that calculated cost, but it only updated 2 rows. (Each product has many parts, and each part can be used in multiple products.)
CREATE OR REPLACE PROCEDURE PR_Update_Curr_Cost
AS
    CURSOR C_Prod_Cost IS
        SELECT pd.Product_ID, Product_Cost, SUM(Qty_Needed * Current_Cost)
        FROM Product pd, Product_Part pp, Part pa
        WHERE pd.Product_ID = pp.Product_ID AND
              pp.Part_ID = pa.Part_ID
        GROUP BY pd.Product_ID, Product_Cost
        HAVING SUM(Qty_Needed * Current_Cost) > Product_Cost
        ORDER BY pd.Product_ID;
    V_Product_ID NUMBER(5,0);
    V_Prod_Cost NUMBER(6,2);
    V_Cur_Prod_Cost NUMBER(6,2);
BEGIN
    OPEN C_Prod_Cost;
    LOOP
        FETCH C_Prod_Cost INTO V_Product_ID, V_Prod_Cost, V_Cur_Prod_Cost;
        EXIT WHEN C_Prod_Cost%NOTFOUND;
        UPDATE Product
        SET Product_Cost = V_Cur_Prod_Cost
        WHERE Product_ID = V_Product_ID;
    END LOOP;
    CLOSE C_Prod_Cost;
EXCEPTION
    WHEN no_data_found THEN
        CLOSE C_Prod_Cost;
        RAISE_APPLICATION_ERROR(-20999, 'All product costs are up to date. No change needed.');
    WHEN invalid_cursor THEN
        CLOSE C_Prod_Cost;
        RAISE_APPLICATION_ERROR(-20998, 'A cursor error has occurred. Contact technical support.');
    WHEN others THEN
        CLOSE C_Prod_Cost;
        RAISE_APPLICATION_ERROR(-20997, 'An undetermined error has occurred. Contact technical support.');
END PR_Update_Curr_Cost;
Before the update:
Product_ID Product_Cost New Calculated Production Cost
2 .62 .7875
3 1 3.31625
4 3.4 12.94125
5 .64 1.12
7 7.46 12.466125
8 1.32 6.444375
9 .27 2.35375
10 .44 2.5375
8 rows selected.
After the update:
Product_ID Product_Cost New Calculated Production Cost
3 1 3.31625
4 3.4 12.94125
5 .64 1.12
8 1.32 6.444375
9 .27 2.35375
10 .44 2.5375
6 rows selected.
Script to create and load the tables:
Drop Table Product_Part;
Drop Table Product;
Drop Table Part;
Create Table Product
(
Product_ID Number(5,0) Constraint PK_Pr_PrID primary key
Constraint NN_Pr_PrID not null,
Product_Name Varchar2(50) Constraint NN_Pr_PrName not null,
Product_Cost Number(6,2) Default '0'
Constraint NN_Pr_PrCost not null
Constraint CK_Pr_PrCost
Check (Product_Cost >= 0)
);
Create Table Part
(
Part_ID Number(4,0) Constraint PK_Pa_PaID primary key
Constraint NN_Pa_PaID not null,
Part_Name Varchar2(40) Constraint NN_Pa_PaName not null,
Current_Cost Number(5,2) Default '0'
Constraint NN_Pa_CurCost not null
Constraint CK_Pa_CurCost
Check (Current_Cost >=0)
);
Create Table Product_Part
(
Product_ID Number(5,0) Constraint FK_PP_PrID_Pr_PrID
References Product (Product_ID)
Constraint NN_PP_PrID not null,
Part_ID Number(4,0) Constraint FK_PP_PaID_Pa_PaID
References Part (Part_ID)
Constraint NN_PP_PaID not null,
Qty_Needed Number(6,2) Default '1'
Constraint NN_PP_QtyNeeded not null,
Constraint PK_PP primary key (Product_ID, Part_ID)
);
-- Insert Into Product
Insert Into Product
(Product_ID, Product_Name, Product_Cost)
Values
(1,'Coffee (Fair Trade)', 0.27);
Insert Into Product
(Product_ID, Product_Name, Product_Cost)
Values
(10, 'Chocolate Sundae', 0.44);
Insert Into Product
(Product_ID, Product_Name, Product_Cost)
Values
(2,'Coke (591 ml)', 0.62);
Insert Into Product
(Product_ID, Product_Name, Product_Cost)
Values
(8,'Hamburger Deluxe',1.32);
Insert Into Product
(Product_ID, Product_Name, Product_Cost)
Values
(3,'Bayou Seafood Gumbo (small)',1.00);
Insert Into Product
(Product_ID, Product_Name, Product_Cost)
Values
(4,'Hot Louisiana Crab Dip',3.40);
Insert Into Product
(Product_ID, Product_Name, Product_Cost)
Values
(6,'Fettuccine Alfredo', 4.01);
Insert Into Product
(Product_ID, Product_Name, Product_Cost)
Values
(7,'Roasted Salmon with Lemon Thyme Crust', 7.46);
Insert Into Product
(Product_ID, Product_Name, Product_Cost)
Values
(5,'Caesar Salad (starter size)', 0.64);
Insert Into Product
(Product_ID, Product_Name, Product_Cost)
Values
(9,'Oven Fried Potatoes', 0.27);
-- Insert Into Part
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(35, 'Potatoes', 0.04);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(10, 'Oil', 0.13);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(22, 'Salmon Steak (4 oz)', 3.28);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(11, 'Ketchup', 0.11);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(34, 'Pepper', 0.25);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(12, 'White Sugar', 0.08);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(20, 'Fresh Tiger Prawns', 0.26);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(21, 'Fettuccine', 0.56);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(13, 'Cream', 0.25);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(14, 'Coke (575 ml)', 0.45);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(17, 'Eggs', 0.11);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(18, 'Romaine Lettuce', 1.25);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(19, 'Vinegar', 0.18);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(23, 'Mustard', 0.18);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(24, 'French Fries', 0.07);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(25, 'Ice Cream', 0.35);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(36, 'Diced Organic Tomatoes', 0.17);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(37, 'Relish', 0.07);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(38, 'Almonds', 0.05);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(26, 'Crushed Almonds', 0.21);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(28, 'Taragon', 2.24);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(27, 'Chocolate Sauce', 0.34);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(29, 'Coffee (Organic Fair Trade)', 0.22);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(30, 'Hamburger', 0.93);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(15, 'Chicken Breast Filet', 0.48);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(16, 'Alaskan King Crab Meat', 3.45);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(31, 'Salt', 0.21);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(32, 'Tea', 0.80);
Insert Into Part
(Part_ID, Part_Name, Current_Cost)
Values
(33, 'Bun', 0.12);
-- Insert Into Product_Part
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(4, 16, 2.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(2, 14, 1.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(3, 13, 1.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(3, 34, 0.50);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(3, 15, 1.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(3, 20, 4.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(1, 13, 0.10);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(1, 12, 0.10);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(1, 29, 0.40);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(8, 30, 3.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(10, 26, 0.50);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(10, 25, 3.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(9, 35, 4.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(7, 34, 0.05);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(7, 12, 1.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(7, 23, 1.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(7, 35, 2.50);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(7, 36, 1.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(7, 31, 0.10);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(7, 22, 2.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(8, 33, 1.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(9, 10, 0.50);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(5, 10, 1.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(10, 27, 0.50);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(5, 19, 0.50);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(6, 13, 2.25);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(8, 24, 3.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(8, 36, 1.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(8, 37, 0.25);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(6, 17, 2.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(8, 10, 0.50);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(5, 36, 1.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(8, 11, 2.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(5, 18, 0.20);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(4, 17, 2.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(4, 34, 0.10);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(4, 13, 1.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(6, 21, 1.00);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(10, 38, 2.50);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(8, 23, 0.50);
Insert Into Product_Part
(Product_ID, Part_ID, Qty_Needed)
Values
(9, 28, 0.50);

This works fine for me. I ran the DDL/DML script to create the objects, then checked which rows would be updated by the procedure by running the cursor's query:
SELECT pd.Product_ID, Product_Cost, SUM(Qty_Needed * Current_Cost)
FROM Product pd, Product_Part pp, Part pa
WHERE pd.Product_ID = pp.Product_ID AND
      pp.Part_ID = pa.Part_ID
GROUP BY pd.Product_ID, Product_Cost
HAVING SUM(Qty_Needed * Current_Cost) > Product_Cost
ORDER BY pd.Product_ID;
3 1 1.895
4 3.4 7.395
8 1.32 3.6825
9 0.27 1.345
10 0.44 1.45
Then I ran the procedure's logic as an anonymous block to update the rows:
DECLARE
    CURSOR C_Prod_Cost IS
        SELECT pd.Product_ID, Product_Cost, SUM(Qty_Needed * Current_Cost)
        FROM Product pd, Product_Part pp, Part pa
        WHERE pd.Product_ID = pp.Product_ID AND
              pp.Part_ID = pa.Part_ID
        GROUP BY pd.Product_ID, Product_Cost
        HAVING SUM(Qty_Needed * Current_Cost) > Product_Cost
        ORDER BY pd.Product_ID;
    V_Product_ID NUMBER(5,0);
    V_Prod_Cost NUMBER(6,2);
    V_Cur_Prod_Cost NUMBER(6,2);
BEGIN
    OPEN C_Prod_Cost;
    LOOP
        FETCH C_Prod_Cost INTO V_Product_ID, V_Prod_Cost, V_Cur_Prod_Cost;
        EXIT WHEN C_Prod_Cost%NOTFOUND;
        UPDATE Product
        SET Product_Cost = V_Cur_Prod_Cost
        WHERE Product_ID = V_Product_ID;
    END LOOP;
    CLOSE C_Prod_Cost;
EXCEPTION
    WHEN no_data_found THEN
        CLOSE C_Prod_Cost;
        RAISE_APPLICATION_ERROR(-20999, 'All product costs are up to date. No change needed.');
    WHEN invalid_cursor THEN
        CLOSE C_Prod_Cost;
        RAISE_APPLICATION_ERROR(-20998, 'A cursor error has occurred. Contact technical support.');
    WHEN others THEN
        CLOSE C_Prod_Cost;
        RAISE_APPLICATION_ERROR(-20997, 'An undetermined error has occurred. Contact technical support.');
END;
/
and checked the original query again. The result was
8 3.68 3.6825
So it seems that, because of rounding, one record was not updated to the full-precision value: V_Cur_Prod_Cost is declared NUMBER(6,2), so the fetched 3.6825 is rounded to 3.68 before the update (and the Product_Cost column is itself NUMBER(6,2), so it could never hold 3.6825 anyway). Other than that, the procedure updated every row that needed updating. Why are you saying it doesn't work? How did you come to that conclusion?
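If you want that leftover row to disappear on the next run, one option (a sketch, not part of the original procedure) is to round the calculated cost to the scale of the Product_Cost column, which also lets a single correlated UPDATE replace the row-by-row loop:
-- Sketch: ROUND(..., 2) matches the NUMBER(6,2) scale of Product_Cost, so a
-- product whose stored cost already equals the rounded calculation is skipped.
UPDATE Product pd
SET Product_Cost = (SELECT ROUND(SUM(pp.Qty_Needed * pa.Current_Cost), 2)
                    FROM Product_Part pp
                    JOIN Part pa ON pa.Part_ID = pp.Part_ID
                    WHERE pp.Product_ID = pd.Product_ID)
WHERE Product_Cost < (SELECT ROUND(SUM(pp.Qty_Needed * pa.Current_Cost), 2)
                      FROM Product_Part pp
                      JOIN Part pa ON pa.Part_ID = pp.Part_ID
                      WHERE pp.Product_ID = pd.Product_ID);
Rounding in the comparison makes the update idempotent: a second run finds nothing left to change.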

Related

How to match one row from one column to the next 5-10 rows in another column in R?

I have a table in R (click the link below, "See my table here") that shows observations of two events per day: observ1 and observ2. I would like to add a third column called 'check'. In column check, I should get a TRUE value if observ1 equals 1 and, 5 to 10 days later, observ2 also equals 1.
As you can see in the table, the check value on row 14 is TRUE. The reason is that observ1 was 1 on row 6, and then 9 days later observ2 was also 1.
I do not know how to code this in R to produce the 'check' column. I'd appreciate any assistance!
See my table here
This is not considered a good way to ask a question; most posters use dput() on their data.frame to provide a sample of their data in the question. The result of that function is copied and pasted from the console, in the format I have used below (see Data). For future questions this is considered good practice. At any rate, hope this solution helps:
Base R solution:
df1$check <- with(
  df1,
  vapply(
    seq_along(observ2),
    function(i){
      if(i - 5 <= 0){
        # fewer than 5 prior days: the condition cannot hold yet
        NA
      }else{
        # look back 5 to 10 days (clamped to the start of the data)
        ir <- max(i - 10, 1)
        ir2 <- (any(observ1[ir:(i - 5)] == 1) & observ2[i] == 1)
        # TRUE where the condition holds, NA otherwise
        ifelse(ir2, ir2, NA)
      }
    },
    logical(1)
  )
)
Data:
df1 <- structure(list(day = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20), observ1 = c(1, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), observ2 = c(0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -20L))

Finding differences between populations

I have equivalent data from 2019 and 2020. The proportions of diagnoses in 2020 look like they differ from 2019, but I'd like to ...
a) statistically test whether the populations are different.
b) determine which categories are the most different.
I've worked out I can do 'a' using:
chisq.test(test$count.2020, test$count.2019)
I don't know how to find out which categories are the ones that are the most different between 2020 and 2019. Any help would be amazing, thanks!
diagnosis <- data.frame(mf_label = c("Audiovestibular", "Autonomic", "Cardiovascular",
"Cerebral palsy", "Cerebrovascular", "COVID", "Cranial nerves",
"CSF disorders", "Developmental", "Epilepsy and consciousness",
"Functional", "Head injury", "Headache", "Hearing loss", "Infection",
"Maxillofacial", "Movement disorders", "Muscle and NMJ", "Musculoskeletal",
"Myelopathy", "Neurodegenerative", "Neuroinflammatory", "Peripheral nerve",
"Plexopathy", "Psychiatric", "Radiculopathy", "Spinal", "Syncope",
"Toxic and nutritional", "Tumour", "Visual system"),
count.2019 = c(5, 0, 1, 1, 2, 0, 4, 3, 0, 7, 4, 0, 24, 0, 0, 2, 22, 3, 3, 0, 3, 18, 12, 0, 0, 2, 2, 0, 1, 4, 0),
count.2020 = c(5, 1, 1, 3, 28, 9, 11, 13, 1, 13, 30, 5, 68, 1, 1, 2, 57, 14, 5, 8, 16, 37, 27, 3, 13, 17, 3, 1, 8, 13, 11))
Your chi-square test is not correct: you need to provide the counts as a table or matrix, not as two separate vectors. And because half of the cells have very small expected values, you need to use simulation to estimate the p-value:
results <- chisq.test(diagnosis[, 2:3], simulate.p.value=TRUE)
The overall table is barely significant at the .05 level. The chisq.test function returns a list that includes the original data, the expected values, the residuals, and the standardized residuals. The manual page (?chisq.test) describes these and provides citations with more details.
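To see which categories drive the difference, you can rank the diagnoses by the absolute size of their standardized residuals (a short sketch; stdres is a documented component of the chisq.test return value, and the variable names here are just illustrative):
results <- chisq.test(diagnosis[, 2:3], simulate.p.value = TRUE)
stdres <- results$stdres                    # standardized residuals
rownames(stdres) <- diagnosis$mf_label      # label rows by diagnosis
stdres[order(-abs(stdres[, 1])), ]          # largest deviations first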

How can I calculate weighted standard errors and plot them in a bar plot?

I have a data frame of counts. I would like to calculate weighted proportions, plot the proportions, and also plot standard error bars for these weighted proportions.
Sample of my data frame:
head(df[1:4,])
badge year total b_1 b_2 b_3 b_4 b_5 b_6 b_7 b_8 b_9 b_10
1 15 2014 14 3 2 1 1 1 1 1 1 1 1
2 15 2015 157 13 12 11 8 6 6 6 5 5 5
3 15 2016 15 5 3 1 1 1 1 1 1 1 0
4 2581 2014 13 1 1 1 1 1 1 1 1 1 1
The data contain counts of 911 calls officers respond to in ten different police beats (b_1, b_2,...) in a given year. So officer 15 responds to 14 calls total in 2014, 3 of which were in beat 1, 2 in beat 2, and so on.
Essentially, what I want is to get the overall proportion of calls that occur within each beat. But I want these proportions to be weighted by the total number of calls.
So far, I've been able to calculate this by just summing the values within each b_ column and the total column, and calculating proportions. I have plotted these in a simple bar plot, but I haven't been able to figure out how to calculate standard errors that are weighted by total.
I have no preference for how the data are plotted. I'm mainly interested in getting the right standard errors.
Here is the code I have so far:
sums_by_beat <- apply(df[, grep('b_', colnames(df))], 2, sum)
props_by_beat <- sums_by_beat / sum(df$total)
# Bar plot of proportions by beat
barplot(props_by_beat, main='Distribution of Calls by Beat',
xlab="Nth Most Common Division", ylim=c(0,1),
names.arg=1:length(props_by_beat), ylab="Percent of Total Calls")
And a 30-row sample of my data:
df <- structure(list(badge = c(15, 15, 15, 2581, 2581, 2745, 2745,
3162, 3162, 3162, 3396, 3650, 3650, 3688, 3688, 3688, 3698, 3698,
3698, 3717, 3717, 3717, 3740, 3740, 3740, 3813, 3873, 3907, 3930,
4007), year = c(2014, 2015, 2016, 2014, 2015, 2015, 2016, 2014,
2015, 2016, 2016, 2014, 2015, 2014, 2015, 2016, 2014, 2015, 2016,
2014, 2015, 2016, 2014, 2015, 2016, 2016, 2015, 2014, 2014, 2014
), total = c(14, 157, 15, 13, 29, 1, 1, 754, 1172, 1039, 14,
1, 2, 34, 57, 146, 3, 7, 28, 593, 1036, 1303, 461, 952, 1370,
1, 4, 41, 5, 451), b_1 = c(3, 13, 5, 1, 3, 1, 1, 33, 84, 83,
2, 1, 2, 5, 10, 14, 2, 7, 7, 39, 72, 75, 42, 69, 81, 1, 1, 7,
1, 36), b_2 = c(2, 12, 3, 1, 2, 0, 0, 33, 61, 52, 2, 0, 0, 3,
6, 8, 1, 0, 2, 37, 65, 70, 29, 65, 75, 0, 1, 5, 1, 23), b_3 = c(1,
11, 1, 1, 2, 0, 0, 32, 57, 45, 2, 0, 0, 3, 5, 8, 0, 0, 2, 34,
62, 67, 28, 50, 73, 0, 1, 3, 1, 22), b_4 = c(1, 8, 1, 1, 2, 0,
0, 31, 44, 39, 2, 0, 0, 3, 3, 7, 0, 0, 2, 34, 61, 67, 26, 42,
72, 0, 1, 3, 1, 21), b_5 = c(1, 6, 1, 1, 1, 0, 0, 30, 42, 37,
1, 0, 0, 3, 3, 7, 0, 0, 1, 33, 53, 61, 23, 42, 67, 0, 0, 2, 1,
21), b_6 = c(1, 6, 1, 1, 1, 0, 0, 30, 40, 36, 1, 0, 0, 2, 2,
6, 0, 0, 1, 32, 53, 61, 22, 41, 63, 0, 0, 2, 0, 21), b_7 = c(1,
6, 1, 1, 1, 0, 0, 26, 39, 35, 1, 0, 0, 2, 2, 6, 0, 0, 1, 30,
47, 58, 22, 39, 62, 0, 0, 2, 0, 21), b_8 = c(1, 5, 1, 1, 1, 0,
0, 26, 39, 33, 1, 0, 0, 2, 2, 6, 0, 0, 1, 30, 47, 58, 21, 38,
59, 0, 0, 2, 0, 19), b_9 = c(1, 5, 1, 1, 1, 0, 0, 24, 34, 33,
1, 0, 0, 2, 2, 5, 0, 0, 1, 30, 43, 57, 20, 37, 57, 0, 0, 2, 0,
15), b_10 = c(1, 5, 0, 1, 1, 0, 0, 23, 34, 32, 1, 0, 0, 1, 2,
5, 0, 0, 1, 27, 40, 56, 18, 36, 55, 0, 0, 2, 0, 14)), row.names = c(NA,
30L), class = "data.frame")
There isn't (as far as I know) a built-in R function to calculate the standard error of a weighted mean, but it is fairly straightforward to calculate - with some assumptions that are probably valid in the case you describe.
See, for instance:
https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Standard_error
Standard error of the weighted mean
If the elements used to calculate the weighted mean are samples from populations that all have the same variance v, then the variance of the weighted sample mean is estimated as:
var_m = v * sum( wnorm^2 ) # wnorm = weights normalized to sum to 1
And the standard error of the weighted mean is equal to the square root of the variance.
sem = sqrt( var_m )
So, we need to calculate the sample variance from the weighted data.
Weighted variance
The weighted population variance (or biased sample variance) is calculated as:
pop_v = sum( w * (x-mean)^2 ) / sum( w )
However, if (as in the case you describe) we are working with samples taken from the population rather than with the population itself, we need to make an adjustment to obtain an unbiased sample variance.
If the weights represent the frequencies of observations underlying each of the elements used to calculate the weighted mean & variance, then the adjustment is:
v = pop_v * sum( w ) / ( sum( w ) -1 )
However, this is not the case here, as the weights are the total frequencies of 911 calls for each officer, not the calls for each beat. So in this case the weights correspond to the reliabilities of each element, and the adjustment is:
v = pop_v * sum( w )^2 / ( sum( w )^2 - sum( w^2) )
weighted.var and weighted.sem functions
Putting all this together, we can define weighted.var and weighted.sem functions, similar to the base R weighted.mean function (note that several R packages, for instance "Hmisc", already include more versatile functions to calculate the weighted variance):
weighted.var = function(x, w, type = "reliability") {
  m = weighted.mean(x, w)
  if(type == "frequency"){ return( sum(w * (x - m)^2) / (sum(w) - 1) ) }
  else { return( sum(w * (x - m)^2) * sum(w) / (sum(w)^2 - sum(w^2)) ) }
}
weighted.sem = function(x, w, ...) { return( sqrt(weighted.var(x, w, ...) * sum(w^2) / sum(w)^2) ) }
Applied to the 911 call data in the question
In the case of the question, the elements from which we want to calculate the weighted mean and weighted SEM are the proportions of calls in each beat, for each officer.
So (finally...):
props = t(apply(df,1,function(row) row[-(1:3)]/row[3]))
wmean_props = apply(props,2,function(col) weighted.mean(col,w=df[,3]))
wsem_props = apply(props,2,function(col) weighted.sem(col,w=df[,3]))
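To draw these on a bar plot like the one in the question, here is a sketch using base R's arrows() for error bars at the weighted mean plus or minus one weighted SEM:
# barplot() invisibly returns the bar midpoints; reuse them as x-positions
b <- barplot(wmean_props, main = 'Distribution of Calls by Beat',
             xlab = "Nth Most Common Division", ylim = c(0, 1),
             names.arg = seq_along(wmean_props), ylab = "Percent of Total Calls")
arrows(b, wmean_props - wsem_props, b, wmean_props + wsem_props,
       length = .02, angle = 90, code = 3)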
Aren't your "proportions" actually the mean of the weighted (by total) observations? Then we could simply calculate the weighted colMeans accordingly.
df2 <- df[, grep('b_', colnames(df))]
means.w <- colMeans(df2 / df$total)
For the error bars we could use the quantiles at alpha/2 and 1 - alpha/2, i.e. for alpha == .05 we use c(.025, .975). (The analytical SDs would yield error bars that extend below zero.)
q.w <- t(apply(df2 / df$total, 2, quantile, c(.025, .975)))
Now we store the x-positions that barplot returns invisibly,
# Bar plot of proportions by beat
b <- barplot(means.w, main='Distribution of Calls by Beat',
xlab="Nth Most Common Division", ylim=c(0,1),
names.arg=1:length(means.w), ylab="Percent of Total Calls")
and construct the error bars with arrows.
arrows(b, q.w[,1], b, q.w[,2], length=.02, angle=90, code=3)

Drop column if it has an observation equal to ZERO

I have a very big data frame with 150000 rows and thousands of columns. A subset is as follows:
df <- data.frame(col1 = c('201507', '201508', '201509', '201510', '201511', '201512', '201601', '201602', '201603'),
col2 = c(12, 45, 6, 23, 17, 32, 67, 23, 12),
col3 = c(0, 0, 12, 0, 67, 34, 87, 19, 9),
col4 = c(4584, 3423, 6723, 1245, 3234, 14577, 213, 557, 5677),
col5 = c(134, 345, 0, 23, 93, 48, 12, 21, 0))
I want to drop any column where:
it has a ZERO value in any row (for example col3 and col5)
the first row of the column is ZERO (for example only col3)
I know this is a simple example, but I have thousands of columns; see the sketch below.
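A minimal sketch of both in base R (assuming, as in the sample, that comparing every column against zero makes sense; the result names are made up):
# Drop every column that contains a zero in any row (drops col3 and col5)
df_no_zeros <- df[, sapply(df, function(x) !any(x == 0)), drop = FALSE]
# Drop only the columns whose first row is zero (drops just col3)
df_first_nonzero <- df[, sapply(df, function(x) x[1] != 0), drop = FALSE]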

How to UPDATE SQLite column while using JOIN

For homework, I am currently trying to figure out how to update a column while using JOIN, but I can't seem to get it right.
The first part is this:
The 'company' you work for has decided to hire a new bilingual support position. Your job is to locate all users who have purchased a Spanish-language track so they can be assigned to a new support representative. Sales have begun to identify all the albums that are classified as Spanish language; so far they have found AlbumIds 8, 21, 22, 23, 24, 25, 26, 27, 28, 29, 32, 33, 34, 41, 42, 45, 47, 52, 53.
I solved this with this query:
SELECT * FROM customers
JOIN invoices USING (CustomerId)
JOIN invoice_items USING (InvoiceId)
JOIN tracks USING (TrackId)
WHERE tracks.AlbumId IN (8, 21, 22, 23, 24, 25, 26, 27, 28, 29, 32, 33, 34, 41, 42, 45, 47, 52, 53)
GROUP BY invoices.InvoiceId;
Now the second part is this:
Help out the sales team by modifying your query from Part 1. Instead of just listing all the customers, it should update the customer's assigned support representative. The new support representative's id is 6.
I tried running this:
UPDATE customers
SET SupportRepId = 6
WHERE(SELECT * FROM customers
JOIN invoices USING (CustomerId)
JOIN invoice_items USING (InvoiceId)
JOIN tracks USING (TrackId)
WHERE tracks.AlbumId IN (8, 21, 22, 23, 24, 25, 26, 27, 28, 29, 32, 33, 34, 41, 42, 45, 47, 52, 53)
GROUP BY invoices.InvoiceId);
But I am getting an error that says:
SQLITE_ERROR: sub-select returns 33 columns - expected 1
errno: 1
code: SQLITE_ERROR
name: Error
I got this working by using the following command:
UPDATE customers
SET SupportRepId = 6
WHERE CustomerId IN (
SELECT customers.CustomerId FROM customers
JOIN invoices USING (CustomerId)
JOIN invoice_items USING (InvoiceId)
JOIN tracks USING (TrackId)
WHERE AlbumId IN (8, 21, 22, 23, 24, 25, 26, 27, 28, 29, 32, 33, 34, 41, 42, 45, 47, 52, 53)
);
The last error is the clue: WHERE expects a single value per row (0 is false, non-zero is true), but SELECT * returns 33 values (columns).
To cater for multiple updates you need a subquery that yields a result for each row processed by the update, e.g. WHERE ... IN or a correlated WHERE EXISTS.
Using WHERE EXISTS, the query could be:
UPDATE customers
SET SupportRepId = 6
WHERE EXISTS (
SELECT 1 FROM customers AS B
JOIN invoices USING (CustomerId)
JOIN invoice_items USING (InvoiceId)
JOIN tracks USING (TrackId)
WHERE tracks.AlbumId IN (8, 21, 22, 23, 24, 25, 26, 27, 28, 29, 32, 33, 34, 41, 42, 45, 47, 52, 53)
AND B.CustomerId = customers.CustomerId
);
Thus, for each row processed by the UPDATE, 1 is returned if the customer has an invoice containing a track from one of the listed albums AND the CustomerId from the correlated subquery (B.CustomerId) matches the CustomerId of the row being updated.
Using WHERE IN, the query could be:
UPDATE customers
SET SupportRepId = 6
WHERE CustomerId IN (
SELECT CustomerId FROM customers
JOIN invoices USING (CustomerId)
JOIN invoice_items USING (InvoiceId)
JOIN tracks USING (TrackId)
WHERE tracks.AlbumId IN (8, 21, 22, 23, 24, 25, 26, 27, 28, 29, 32, 33, 34, 41, 42, 45, 47, 52, 53)
);
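As an aside (a sketch, not part of the original answer), the IN version does not actually need the customers table inside the subquery, since invoices already carries CustomerId:
UPDATE customers
SET SupportRepId = 6
WHERE CustomerId IN (
    SELECT invoices.CustomerId FROM invoices
    JOIN invoice_items USING (InvoiceId)
    JOIN tracks USING (TrackId)
    WHERE tracks.AlbumId IN (8, 21, 22, 23, 24, 25, 26, 27, 28, 29, 32, 33, 34, 41, 42, 45, 47, 52, 53)
);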
