"RecursionError: maximum recursion depth exceeded" while adding a new constraints to a convex problem in cvxpy - constraints

I'm new in cvxpy and working on a certain energy optimization model I'm trying to add new constraints. When I try to append the new constraints i get the following error "RecursionError: maximum recursion depth exceeded". If I exclude the last lines related (rules.appen(...)) everything works. I have tried to increase the system recursion limit but this doesn't work. Any suggestion?
class ProductionRamping(Constraint):
# it runs on both operation and planning, single- and multi-region
def __get_energy_prod_differences(self):
"""Creates variables to constrain the increase and decrease
in energy production when passing from one timestep to the next"""
# Create a dataframe containing the differences between the energy production of each tech
# in each timestep and the production in the previous timestep
energy_prod_differences = {}
# for each region
for reg in self.model_data.settings.regions:
energy_prod_differences_regional = {}
#for each tech, except demand, transmission, and storage
for tech_type in self.model_data.settings.technologies[reg].keys():
if tech_type == "Demand" or tech_type == "Transmission" or tech_type == "Storage":
continue
# select the first row
first_row = cp.reshape(
self.variables.technology_prod[reg][tech_type][0,:],
(1, self.variables.technology_prod[reg][tech_type][0,:].shape[0])
)
# in the case the timesteps are just one, the shifted dataframe containing the energy production
# will only be the first row; if timesteps are > 1 instead we have a shifted dataframe
if self.variables.technology_prod[reg][tech_type].shape[0] > 1:
shifted_prod = cp.vstack([first_row, (self.variables.technology_prod[reg][tech_type][:-1,:])])
else:
shifted_prod = first_row
# compute the difference between the energy production in each timestep and the the production in the previouos timestep
energy_prod_differences_regional[tech_type] = self.variables.technology_prod[reg][tech_type] - shifted_prod
energy_prod_differences[reg] = energy_prod_differences_regional
return energy_prod_differences ## 8760*nyears x ntechs
def transform_total_capacity(capacity, regions, timeslice_fraction, years):
totcapacity ={}
for reg in regions:
totcapacity_regional = {}
for tech_type in capacity[reg].keys():
if tech_type == "Demand" or tech_type == "Transmission" or tech_type == "Storage":
continue
totcapacity_hourly = cp.reshape(
capacity[reg][tech_type][0,:],
(1, capacity[reg][tech_type][0,:].shape[0]))
totcapacity_yearly = totcapacity_hourly
for hours in np.arange(len(timeslice_fraction)-1):
totcapacity_yearly = cp.vstack([totcapacity_yearly,totcapacity_hourly])
totcapacity_multiyear_regional = totcapacity_yearly
for indx in np.arange(len(years)-1):
totcapacity_hourly1 = cp.reshape(
capacity[reg][tech_type][indx,:],
(1, capacity[reg][tech_type][indx,:].shape[0]))
totcapacity_yearly1 = totcapacity_hourly1
for hours in np.arange(len(timeslice_fraction)-1):
totcapacity_yearly1 = cp.vstack([totcapacity_yearly1,totcapacity_hourly1])
totcapacity_multiyear_regional = cp.vstack ([totcapacity_multiyear_regional,totcapacity_yearly1])
totcapacity_regional[tech_type] = totcapacity_multiyear_regional
totcapacity[reg] = totcapacity_regional
return totcapacity
def _check(self):
assert hasattr(self.variables, 'totalcapacity'), "totalcapacity must be defined"
def rules(self):
rules = []
totcapacity = transform_total_capacity(
self.variables.totalcapacity,
self.model_data.settings.regions,
self.model_data.settings.timeslice_fraction,
self.model_data.settings.years)
for reg in self.model_data.settings.regions:
for tech_type, value in totcapacity[reg].items():
max_percentage_ramps = self.model_data.regional_parameters[reg]["prod_max_ramp"].loc[:, tech_type]
max_percentage_ramps = max_percentage_ramps.reindex(max_percentage_ramps.index.repeat(
len(self.model_data.settings.timeslice_fraction))
).values
min_percentage_ramps = self.model_data.regional_parameters[reg]["prod_min_ramp"].loc[:, tech_type]
min_percentage_ramps = min_percentage_ramps.reindex(min_percentage_ramps.index.repeat(
len(self.model_data.settings.timeslice_fraction))
).values
energy_prod_differences = self.__get_energy_prod_differences()
for indx, year in enumerate(self.model_data.settings.years):
max_ramp_in_timestep = cp.multiply ( value [indx * len(self.model_data.settings.timeslice_fraction) : (indx + 1)
* len(self.model_data.settings.time_steps), :] , max_percentage_ramps)
min_ramp_in_timestep = cp.multiply ( value [indx * len(self.model_data.settings.timeslice_fraction) : (indx + 1)
* len(self.model_data.settings.time_steps), :] , min_percentage_ramps)
energy_prod = energy_prod_differences[reg][tech_type][indx * len(self.model_data.settings.timeslice_fraction) : (indx + 1)
* len(self.model_data.settings.timeslice_fraction), :]
diff = energy_prod - max_ramp_in_timestep
diff1 = energy_prod * -1 - min_ramp_in_timestep
rules.append(
diff <= 0
)
rules.append(
diff1 <= 0
)
return rules

Related

TypeError: Caught TypeError in DataLoader worker process 0. TypeError: 'KeyError' object is not iterable

from torchvision_starter.engine import train_one_epoch, evaluate
from torchvision_starter import utils
import multiprocessing
import time
n_cpu = multiprocessing.cpu_count()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
_ = model.to(device)
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
step_size=3,
gamma=0.2,
verbose=True
)
# Let's train for 10 epochs
num_epochs = 1
start = time.time()
for epoch in range(10, 10 + num_epochs):
# train for one epoch, printing every 10 iterations
train_one_epoch(model, optimizer, data_loaders['train'], device, epoch, print_freq=10)
# update the learning rate
lr_scheduler.step()
# evaluate on the validation dataset
evaluate(model, data_loaders['valid'], device=device)
stop = time.time()
print(f"\n\n{num_epochs} epochs in {stop - start} s ({(stop-start) / 3600:.2f} hrs)")
Before I move on to this part, everything is OK. But after I run the part, the error is like below:
I have tried to add drop_last to the helper.py's function like:
data_loaders["train"] = torch.utils.data.DataLoader(
train_data,
batch_size=batch_size,
sampler=train_sampler,
num_workers=num_workers,
collate_fn=utils.collate_fn,
drop_last=True
)
But it doesn't work. By the way, the torch and torchvision are compatible and Cuda is available.
I wonder how to fix it.
The get_data_loaders function:
def get_data_loaders(
folder, batch_size: int = 2, valid_size: float = 0.2, num_workers: int = -1, limit: int = -1, thinning: int = None
):
"""
Create and returns the train_one_epoch, validation and test data loaders.
:param foder: folder containing the dataset
:param batch_size: size of the mini-batches
:param valid_size: fraction of the dataset to use for validation. For example 0.2
means that 20% of the dataset will be used for validation
:param num_workers: number of workers to use in the data loaders. Use -1 to mean
"use all my cores"
:param limit: maximum number of data points to consider
:param thinning: take every n-th frame, instead of all frames
:return a dictionary with 3 keys: 'train_one_epoch', 'valid' and 'test' containing respectively the
train_one_epoch, validation and test data loaders
"""
if num_workers == -1:
# Use all cores
num_workers = multiprocessing.cpu_count()
# We will fill this up later
data_loaders = {"train": None, "valid": None, "test": None}
# create 3 sets of data transforms: one for the training dataset,
# containing data augmentation, one for the validation dataset
# (without data augmentation) and one for the test set (again
# without augmentation)
data_transforms = {
"train": get_transform(UdacitySelfDrivingDataset.mean, UdacitySelfDrivingDataset.std, train=True),
"valid": get_transform(UdacitySelfDrivingDataset.mean, UdacitySelfDrivingDataset.std, train=False),
"test": get_transform(UdacitySelfDrivingDataset.mean, UdacitySelfDrivingDataset.std, train=False),
}
# Create train and validation datasets
train_data = UdacitySelfDrivingDataset(
folder,
transform=data_transforms["train"],
train=True,
thinning=thinning
)
# The validation dataset is a split from the train_one_epoch dataset, so we read
# from the same folder, but we apply the transforms for validation
valid_data = UdacitySelfDrivingDataset(
folder,
transform=data_transforms["valid"],
train=True,
thinning=thinning
)
# obtain training indices that will be used for validation
n_tot = len(train_data)
indices = torch.randperm(n_tot)
# If requested, limit the number of data points to consider
if limit > 0:
indices = indices[:limit]
n_tot = limit
split = int(math.ceil(valid_size * n_tot))
train_idx, valid_idx = indices[split:], indices[:split]
# define samplers for obtaining training and validation batches
train_sampler = torch.utils.data.SubsetRandomSampler(train_idx)
valid_sampler = torch.utils.data.SubsetRandomSampler(valid_idx) # =
# prepare data loaders
data_loaders["train"] = torch.utils.data.DataLoader(
train_data,
batch_size=batch_size,
sampler=train_sampler,
num_workers=num_workers,
collate_fn=utils.collate_fn,
drop_last=True
)
data_loaders["valid"] = torch.utils.data.DataLoader(
valid_data, # -
batch_size=batch_size, # -
sampler=valid_sampler, # -
num_workers=num_workers, # -
collate_fn=utils.collate_fn,
drop_last=True
)
# Now create the test data loader
test_data = UdacitySelfDrivingDataset(
folder,
transform=data_transforms["test"],
train=False,
thinning=thinning
)
if limit > 0:
indices = torch.arange(limit)
test_sampler = torch.utils.data.SubsetRandomSampler(indices)
else:
test_sampler = None
data_loaders["test"] = torch.utils.data.DataLoader(
test_data,
batch_size=batch_size,
shuffle=False,
num_workers=num_workers,
sampler=test_sampler,
collate_fn=utils.collate_fn,
drop_last=True
# -
)
return data_loaders
class UdacitySelfDrivingDataset(torch.utils.data.Dataset):
# Mean and std of the dataset to be used in nn.Normalize
mean = torch.tensor([0.3680, 0.3788, 0.3892])
std = torch.tensor([0.2902, 0.3069, 0.3242])
def __init__(self, root, transform, train=True, thinning=None):
super().__init__()
self.root = os.path.abspath(os.path.expandvars(os.path.expanduser(root)))
self.transform = transform
# load datasets
if train:
self.df = pd.read_csv(os.path.join(self.root, "labels_train.csv"))
else:
self.df = pd.read_csv(os.path.join(self.root, "labels_test.csv"))
# Index by file id (i.e., a sequence of the same length as the number of images)
codes, uniques = pd.factorize(self.df['frame'])
if thinning:
# Take every n-th rows. This makes sense because the images are
# frames of videos from the car, so we are essentially reducing
# the frame rate
thinned = uniques[::thinning]
idx = self.df['frame'].isin(thinned)
print(f"Keeping {thinned.shape[0]} of {uniques.shape[0]} images")
print(f"Keeping {idx.sum()} objects out of {self.df.shape[0]}")
self.df = self.df[idx].reset_index(drop=True)
# Recompute codes
codes, uniques = pd.factorize(self.df['frame'])
self.n_images = len(uniques)
self.df['image_id'] = codes
self.df.set_index("image_id", inplace=True)
self.classes = ['car', 'truck', 'pedestrian', 'bicyclist', 'light']
self.colors = ['cyan', 'blue', 'red', 'purple', 'orange']
#property
def n_classes(self):
return len(self.classes)
def __getitem__(self, idx):
if idx in self.df.index:
row = self.df.loc[[idx]]
else:
return KeyError(f"Element {idx} not in dataframe")
# load images fromm file
img_path = os.path.join(self.root, "images", row['frame'].iloc[0])
img = Image.open(img_path).convert("RGB")
# Exclude bogus boxes with 0 height or width
h = row['ymax'] - row['ymin']
w = row['xmax'] - row['xmin']
filter_idx = (h > 0) & (w > 0)
row = row[filter_idx]
# get bounding box coordinates for each mask
boxes = row[['xmin', 'ymin', 'xmax', 'ymax']].values
# convert everything into a torch.Tensor
boxes = torch.as_tensor(boxes, dtype=torch.float32)
# get the labels
labels = torch.as_tensor(row['class_id'].values, dtype=int)
image_id = torch.tensor([idx])
area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
# assume no crowd for everything
iscrowd = torch.zeros((row.shape[0],), dtype=torch.int64)
target = {}
target["boxes"] = boxes
target["labels"] = labels
target["image_id"] = image_id
target["area"] = area
target["iscrowd"] = iscrowd
if self.transform is not None:
img, target = self.transform(img, target)
return img, target
def __len__(self):
return self.n_images
def plot(self, idx, renormalize=True, predictions=None, threshold=0.5, ax=None):
image, label_js = self[idx]
if renormalize:
# Invert the T.Normalize transform
unnormalize = T.Compose(
[
T.Normalize(mean = [ 0., 0., 0. ], std = 1 / type(self).std),
T.Normalize(mean = -type(self).mean, std = [ 1., 1., 1. ])
]
)
image, label_js = unnormalize(image, label_js)
if ax is None:
fig, ax = plt.subplots(figsize=(8, 8))
_ = ax.imshow(torch.permute(image, [1, 2, 0]))
for i, box in enumerate(label_js['boxes']):
xy = (box[0], box[1])
h, w = (box[2] - box[0]), (box[3] - box[1])
r = patches.Rectangle(xy, h, w, fill=False, color=self.colors[label_js['labels'][i]-1], lw=2, alpha=0.5)
ax.add_patch(r)
if predictions is not None:
# Make sure the predictions are on the CPU
for k in predictions:
predictions[k] = predictions[k].detach().cpu().numpy()
for i, box in enumerate(predictions['boxes']):
if predictions['scores'][i] > threshold:
xy = (box[0], box[1])
h, w = (box[2] - box[0]), (box[3] - box[1])
r = patches.Rectangle(xy, h, w, fill=False, color=self.colors[predictions['labels'][i]-1], lw=2, linestyle=':')
ax.add_patch(r)
_ = ax.axis("off")
return ax

Weighted average cost of short-term stock trading

I'm new to programming and trying to learn Julia. I tried to compute the weighted average cost of short-term stock trading activities as I did before in R. I rewrite the code in Julia, unfortunately, it return the incorrect result in data frame format.
I tried to investigate the result of each iteration step by changing return vwavg to println([volume[i], s, unitprice[i], value[i], t, vwavg[i], u]) and the output is correct. is it a problem with rounding?
Really appreciate your help
# create trial dataset
df = DataFrame(qty = [3, 2, 2, -7, 4, 4, -3,-2, 4, 4, -2, -3],
price = [100.0, 99.0, 101.0, 103.0, 95.0, 93.0, 90.0, 90.0, 93.0, 95.0, 93.0, 92.0])
# create function for weighted average cost of stock price
function vwacost(volume, unitprice)
value = Vector{Float64}(undef, length(volume))
vwavg = Vector{Float64}(undef, length(volume))
for i in 1:length(volume)
s = 0
t = 0
u = 0
if volume[i]>0
value[i] = (volume[i]*unitprice[i]) + t
volume[i] = volume[i] + s
vwavg[i] = value[i]/volume[i]
u = vwavg[i]
s = volume[i]
t = value[i]
else
volume[i] = volume[i] + s
value[i] = u * volume[i]
s = volume[i]
t = value[i]
vwavg[i] = u
end
return vwavg
end
end
out = transform(df, [:qty, :price] => vwacost)
Simple error:
for i in 1:length(volume)
...
return vwavg
end
should be:
for i in 1:length(volume)
...
end
return vwavg
You are currently returning the result after the first loop iteration, which is why your resulting vwawg vector has only one (the first) calculated entry, with all other entries being zero/whatever was in memory when you created the vwawg vector in the first place.
Ok, the second problem of changing original df that result in incorrect result can be solved by copy(df):
out = select(copy(df), [:qty, :price] => vwacost => :avgcost)
thus, the original df will not change and the result will consistent over time.

Calculate RSI indicator according to tradingview?

I would like to calculate RSI 14 in line with the tradingview chart.
According to there wiki this should be the solution:
https://www.tradingview.com/wiki/Talk:Relative_Strength_Index_(RSI)
I implemented this is in a object called RSI:
Calling within object RSI:
self.df['rsi1'] = self.calculate_RSI_method_1(self.df, period=self.period)
Implementation of the code the calculation:
def calculate_RSI_method_1(self, ohlc: pd.DataFrame, period: int = 14) -> pd.Series:
delta = ohlc["close"].diff()
ohlc['up'] = delta.copy()
ohlc['down'] = delta.copy()
ohlc['up'] = pd.to_numeric(ohlc['up'])
ohlc['down'] = pd.to_numeric(ohlc['down'])
ohlc['up'][ohlc['up'] < 0] = 0
ohlc['down'][ohlc['down'] > 0] = 0
# This one below is not correct, but why?
ohlc['_gain'] = ohlc['up'].ewm(com=(period - 1), min_periods=period).mean()
ohlc['_loss'] = ohlc['down'].abs().ewm(com=(period - 1), min_periods=period).mean()
ohlc['RS`'] = ohlc['_gain']/ohlc['_loss']
ohlc['rsi'] = pd.Series(100 - (100 / (1 + ohlc['RS`'])))
self.currentvalue = round(self.df['rsi'].iloc[-1], 8)
print (self.currentvalue)
self.exportspreadsheetfordebugging(ohlc, 'calculate_RSI_method_1', self.symbol)
I tested several other solution like e.g but non return a good value:
https://github.com/peerchemist/finta
https://gist.github.com/jmoz/1f93b264650376131ed65875782df386
Therefore I created a unittest based on :
https://school.stockcharts.com/doku.php?id=technical_indicators:relative_strength_index_rsi
I created an input file: (See excel image below)
and a output file: (See excel image below)
Running the unittest (unittest code not included here) should result in but is only checking the last value.
if result == 37.77295211:
log.info("Unit test 001 - PASSED")
return True
else:
log.error("Unit test 001 - NOT PASSED")
return False
But again I cannot pass the test.
I checked all values by help with excel.
So now i'm a little bit lost.
If I'm following this question:
Calculate RSI indicator from pandas DataFrame?
But this will not give any value in the gain.
a) How should the calculation be in order to align the unittest?
b) How should the calculation be in order to align with tradingview?
Here is a Python implementation of the current RSI indicator version in TradingView:
https://github.com/lukaszbinden/rsi_tradingview/blob/main/rsi.py
I had same issue in calculating RSI and the result was different from TradingView,
I have found RSI Step 2 formula described in InvestoPedia and I changed the code as below:
N = 14
close_price0 = float(klines[0][4])
gain_avg0 = loss_avg0 = close_price0
for kline in klines[1:]:
close_price = float(kline[4])
if close_price > close_price0:
gain = close_price - close_price0
loss = 0
else:
gain = 0
loss = close_price0 - close_price
close_price0 = close_price
gain_avg = (gain_avg0 * (N - 1) + gain) / N
loss_avg = (loss_avg0 * (N - 1) + loss) / N
rsi = 100 - 100 / (1 + gain_avg / loss_avg)
gain_avg0 = gain_avg
loss_avg0 = loss_avg
N is the number of period for calculating RSI (by default = 14)
the code is put in a loop to calculate all RSI values for a series.
For those who are experience the same.
My raw data contained ticks where the volume is zero. Filtering this OLHCV rows will directly give the good results.

Getting a zero value for a variable that clearly should not be zero

I have written the following code to calculate temperature distribution along a fin. For some reason, it keeps calculating certain values as zero, even when they aren't!
for t = 0:300:3000
for i = 2:L2
if i ==2
A(i)= 0.75*rhocp*DX/DT + 3*k/dx;
B(i)= k/dx;
C(i)= 2*k/dx;
D(i)= 0.75*rhocp*(DX/DT)*T(i);
elseif i == L2
A(i)= 0.75*rhocp*DX/DT + 3*k/dx;
B(i)= 2*k/dx;
C(i)= k/dx;
D(i)= 0.75*rhocp*(DX/DT)*T(i);
else
A(i)= 0.75*rhocp*DX/DT + 2*k/dx;
B(i)= k/dx;
C(i)= k/dx;
D(i)= rhocp*(DX/DT)*T(i);
end
P(1) = 0;
Q(1) = T(1);
for i = 2:L2
DENO = A(i) - C(i)*P(i-1);
NUM = D(i)+ C(i)*Q(i-1);
P(i) = B(i)/DENO;
Q(i) = NUM/DENO;
end
for i =L2:-1:2
if i == L2
T(i) = Q(i);
else
T(i) = P(i)*T(i+1) + Q(i);
end
end
T(L1) = ((2*k*T(L2) - h*DX*Tinf)/(2*k - h*DX));
end
disp (T);
`
DENO and NUM is calculated as zero in the first iteration, even though on calculating their values is not zero! This leads to "Division by zero" error.
A(2)-C(2)*P(1)
ans =
3750.
Analytically it has got a value though.
Please give the parameter values so one can run your script...
DT was missing , i set it to 1 .
Finally I found the problem: you forgot an end to close the loop
for i = 2:L2
the end has to be put just before P(1)=0
Moreover this loop does not depend on the t value so it could be put before the loop for t = 0:300:3000

Standard Deviation for SQLite

I've searched the SQLite docs and couldn't find anything, but I've also searched on Google and a few results appeared.
Does SQLite have any built-in Standard Deviation function?
You can calculate the variance in SQL:
create table t (row int);
insert into t values (1),(2),(3);
SELECT AVG((t.row - sub.a) * (t.row - sub.a)) as var from t,
(SELECT AVG(row) AS a FROM t) AS sub;
0.666666666666667
However, you still have to calculate the square root to get the standard deviation.
The aggregate functions supported by SQLite are here:
http://www.sqlite.org/lang_aggfunc.html
STDEV is not in the list.
However, the module extension-functions.c in this page contains a STDEV function.
There is still no built-in stdev function in sqlite. However, you can define (as Alix has done) a user-defined aggregator function. Here is a complete example in Python:
import sqlite3
import math
class StdevFunc:
def __init__(self):
self.M = 0.0
self.S = 0.0
self.k = 1
def step(self, value):
if value is None:
return
tM = self.M
self.M += (value - tM) / self.k
self.S += (value - tM) * (value - self.M)
self.k += 1
def finalize(self):
if self.k < 3:
return None
return math.sqrt(self.S / (self.k-2))
with sqlite3.connect(':memory:') as con:
con.create_aggregate("stdev", 1, StdevFunc)
cur = con.cursor()
cur.execute("create table test(i)")
cur.executemany("insert into test(i) values (?)", [(1,), (2,), (3,), (4,), (5,)])
cur.execute("insert into test(i) values (null)")
cur.execute("select avg(i) from test")
print("avg: %f" % cur.fetchone()[0])
cur.execute("select stdev(i) from test")
print("stdev: %f" % cur.fetchone()[0])
This will print:
avg: 3.000000
stdev: 1.581139
Compare with MySQL: http://sqlfiddle.com/#!2/ad42f3/3/0
Use variance formula V(X) = E(X^2) - E(X)^2. In SQL sqlite
SELECT AVG(col*col) - AVG(col)*AVG(col) FROM table
To get standard deviation you need to take the square root V(X)^(1/2)
I implemented the Welford's method (the same as extension-functions.c) as a SQLite UDF:
$db->sqliteCreateAggregate('stdev',
function (&$context, $row, $data) // step callback
{
if (isset($context) !== true) // $context is null at first
{
$context = array
(
'k' => 0,
'm' => 0,
's' => 0,
);
}
if (isset($data) === true) // the standard is non-NULL values only
{
$context['s'] += ($data - $context['m']) * ($data - ($context['m'] += ($data - $context['m']) / ++$context['k']));
}
return $context;
},
function (&$context, $row) // fini callback
{
if ($context['k'] > 0) // return NULL if no non-NULL values exist
{
return sqrt($context['s'] / $context['k']);
}
return null;
},
1);
That's in PHP ($db is the PDO object) but it should be trivial to port to another language.
SQLite is soooo cool. <3
a little trick
select ((sum(value)*sum(value) - sum(value * value))/((count(*)-1)*(count(*))))
from the_table ;
then the only thing left is to calculate sqrt outside.
No, I searched this same issue, and ended having to do the calculations with my application (PHP)
added some error detection in the python functions
class StdevFunc:
"""
For use as an aggregate function in SQLite
"""
def __init__(self):
self.M = 0.0
self.S = 0.0
self.k = 0
def step(self, value):
try:
# automatically convert text to float, like the rest of SQLite
val = float(value) # if fails, skips this iteration, which also ignores nulls
tM = self.M
self.k += 1
self.M += ((val - tM) / self.k)
self.S += ((val - tM) * (val - self.M))
except:
pass
def finalize(self):
if self.k <= 1: # avoid division by zero
return none
else:
return math.sqrt(self.S / (self.k-1))
You don't state which version of standard deviation you wish to calculate but variances (standard deviation squared) for either version can be calculated using a combination of the sum() and count() aggregate functions.
select
(count(val)*sum(val*val) - (sum(val)*sum(val)))/((count(val)-1)*(count(val))) as sample_variance,
(count(val)*sum(val*val) - (sum(val)*sum(val)))/((count(val))*(count(val))) as population_variance
from ... ;
It will still be necessary to take the square root of these to obtain the standard deviation.
#!/usr/bin/python
# -*- coding: utf-8 -*-
#Values produced by this script can be verified by follwing the steps
#found at https://support.microsoft.com/en-us/kb/213930 to Verify
#by chosing a non memory based database.
import sqlite3
import math
import random
import os
import sys
import traceback
import random
class StdevFunc:
def __init__(self):
self.M = 0.0 #Mean
self.V = 0.0 #Used to Calculate Variance
self.S = 0.0 #Standard Deviation
self.k = 1 #Population or Small
def step(self, value):
try:
if value is None:
return None
tM = self.M
self.M += (value - tM) / self.k
self.V += (value - tM) * (value - self.M)
self.k += 1
except Exception as EXStep:
pass
return None
def finalize(self):
try:
if ((self.k - 1) < 3):
return None
#Now with our range Calculated, and Multiplied finish the Variance Calculation
self.V = (self.V / (self.k-2))
#Standard Deviation is the Square Root of Variance
self.S = math.sqrt(self.V)
return self.S
except Exception as EXFinal:
pass
return None
def Histogram(Population):
try:
BinCount = 6
More = 0
#a = 1 #For testing Trapping
#b = 0 #and Trace Back
#c = (a / b) #with Detailed Info
#If you want to store the Database
#uncDatabase = os.path.join(os.getcwd(),"BellCurve.db3")
#con = sqlite3.connect(uncDatabase)
#If you want the database in Memory
con = sqlite3.connect(':memory:')
#row_factory allows accessing fields by Row and Col Name
con.row_factory = sqlite3.Row
#Add our Non Persistent, Runtime Standard Deviation Function to the Database
con.create_aggregate("Stdev", 1, StdevFunc)
#Lets Grab a Cursor
cur = con.cursor()
#Lets Initialize some tables, so each run with be clear of previous run
cur.executescript('drop table if exists MyData;') #executescript requires ; at the end of the string
cur.execute("create table IF NOT EXISTS MyData('ID' INTEGER PRIMARY KEY AUTOINCREMENT, 'Val' FLOAT)")
cur.executescript('drop table if exists Bins;') #executescript requires ; at the end of the string
cur.execute("create table IF NOT EXISTS Bins('ID' INTEGER PRIMARY KEY AUTOINCREMENT, 'Bin' UNSIGNED INTEGER, 'Val' FLOAT, 'Frequency' UNSIGNED BIG INT)")
#Lets generate some random data, and insert in to the Database
for n in range(0,(Population)):
sql = "insert into MyData(Val) values ({0})".format(random.uniform(-1,1))
#If Whole Number Integer greater that value of 2, Range Greater that 1.5
#sql = "insert into MyData(Val) values ({0})".format(random.randint(-1,1))
cur.execute(sql)
pass
#Now let’s calculate some built in Aggregates, that SQLite comes with
cur.execute("select Avg(Val) from MyData")
Average = cur.fetchone()[0]
cur.execute("select Max(Val) from MyData")
Max = cur.fetchone()[0]
cur.execute("select Min(Val) from MyData")
Min = cur.fetchone()[0]
cur.execute("select Count(Val) from MyData")
Records = cur.fetchone()[0]
#Now let’s get Standard Deviation using our function that we added
cur.execute("select Stdev(Val) from MyData")
Stdev = cur.fetchone()[0]
#And Calculate Range
Range = float(abs(float(Max)-float(Min)))
if (Stdev == None):
print("================================ Data Error ===============================")
print(" Insufficient Population Size, Or Bad Data.")
print("*****************************************************************************")
elif (abs(Max-Min) == 0):
print("================================ Data Error ===============================")
print(" The entire Population Contains Identical values, Distribution Incalculable.")
print("******************************************************************************")
else:
Bin = [] #Holds the Bin Values
Frequency = [] #Holds the Bin Frequency for each Bin
#Establish the 1st Bin, which is based on (Standard Deviation * 3) being subtracted from the Mean
Bin.append(float((Average - ((3 * Stdev)))))
Frequency.append(0)
#Establish the remaining Bins, which is basically adding 1 Standard Deviation
#for each interation, -3, -2, -1, 1, 2, 3
for b in range(0,(BinCount) + 1):
Bin.append((float(Bin[(b)]) + Stdev))
Frequency.append(0)
for b in range(0,(BinCount) + 1):
#Lets exploit the Database and have it do the hard work calculating distribution
#of all the Bins, with SQL's between operator, but making it left inclusive, right exclusive.
sqlBinFreq = "select count(*) as Frequency from MyData where val between {0} and {1} and Val < {2}". \
format(float((Bin[b])), float(Bin[(b + 1)]), float(Bin[(b + 1)]))
#If the Database Reports Values that fall between the Current Bin, Store the Frequency to a Bins Table.
for rowBinFreq in cur.execute(sqlBinFreq):
Frequency[(b + 1)] = rowBinFreq['Frequency']
sqlBinFreqInsert = "insert into Bins (Bin, Val, Frequency) values ({0}, {1}, {2})". \
format(b, float(Bin[b]), Frequency[(b)])
cur.execute(sqlBinFreqInsert)
#Allthough this Demo is not likley produce values that
#fall outside of Standard Distribution
#if this demo was to Calculate with real data, we want to know
#how many non-Standard data points we have.
More = (More + Frequency[b])
More = abs((Records - More))
#Add the More value
sqlBinFreqInsert = "insert into Bins (Bin, Val, Frequency) values ({0}, {1}, {2})". \
format((BinCount + 1), float(0), More)
cur.execute(sqlBinFreqInsert)
#Now Report the Analysis
print("================================ The Population ==============================")
print(" {0} {1} {2} {3} {4} {5}". \
format("Size".rjust(10, ' '), \
"Max".rjust(10, ' '), \
"Min".rjust(10, ' '), \
"Mean".rjust(10, ' '), \
"Range".rjust(10, ' '), \
"Stdev".rjust(10, ' ')))
print("Aggregates: {0:10d} {1:10.4f} {2:10.4f} {3:10.4f} {4:10.4f} {5:10.4f}". \
format(Population, Max, Min, Average, Range, Stdev))
print("================================= The Bell Curve =============================")
LabelString = "{0} {1} {2} {3}". \
format("Bin".ljust(8, ' '), \
"Ranges".rjust(8, ' '), \
"Frequency".rjust(8, ' '), \
"Histogram".rjust(6, ' '))
print(LabelString)
print("------------------------------------------------------------------------------")
#Let's Paint a Histogram
sqlChart = "select * from Bins order by Bin asc"
for rowChart in cur.execute(sqlChart):
if (rowChart['Bin'] == 7):
#Bin 7 is not really a bin, but where we place the values that did not fit into the
#Normal Distribution. This script was tested against Excel's Bell Curve Example
#https://support.microsoft.com/en-us/kb/213930
#and produces the same results. Feel free to test it.
BinName = "More"
ChartString = "{0:<6} {1:<10} {2:10.0f}". \
format(BinName, \
"", \
More)
else:
#Theses are the actual bins where values fall within the distribution.
BinName = (rowChart['Bin'] + 1)
#Scale the Chart
fPercent = ((float(rowChart['Frequency']) / float(Records) * 100))
iPrecent = int(math.ceil(fPercent))
ChartString = "{0:<6} {1:10.4f} {2:10.0f} {3}". \
format(BinName, \
rowChart['Val'], \
rowChart['Frequency'], \
"".rjust(iPrecent, '#'))
print(ChartString)
print("******************************************************************************")
#Commit to Database
con.commit()
#Clean Up
cur.close()
con.close()
except Exception as EXBellCurve:
pass
TraceInfo = traceback.format_exc()
raise Exception(TraceInfo)

Resources