Running R in Scala using rscala causing issues - r

I recently came across an R package called rscala. https://darrenjw.wordpress.com/tag/rscala/
I tried executing the example; however, the program never finished running. I am not sure what may be wrong. Whenever I try to instantiate RClient, it seems to run forever. Please help.

For me the following code runs:
import breeze.stats.distributions._
import breeze.linalg._
import org.ddahl.rscala.RClient
/** Simulates Poisson-regression data in Scala, then fits the model in R through rscala. */
object ScalaToRTest {

  def main(args: Array[String]): Unit = {
    // Simulate 1000 covariates and matching Poisson counts with log-mean 0.1 * x - 3.
    val covariates = Uniform(50,60).sample(1000)
    val linearPredictor = covariates.map(xi => (xi * 0.1) - 3)
    val means = linearPredictor.map(e => math.exp(e))
    val counts = means.map(m => Poisson(m).draw)

    // Start an R interpreter and send both samples to it as `x` and `y`.
    val R = RClient()
    R.x = covariates.toArray
    R.y = counts.toArray

    // Fit the Poisson regression model on the R side.
    R.eval("mod <- glm(y~x,family=poisson())")

    // Pull the fitted coefficients back into a Breeze vector and print them.
    val beta = DenseVector[Double](R.evalD1("mod$coefficients"))
    println(beta)
  }
}
Output:
DenseVector(-3.1683714618415855, 0.1031332817387318)
build.sbt
// Project identity.
name := "scalaRdemo"
version := "0.1"
// Scala version used for the build; the %% dependencies below are resolved against it.
scalaVersion := "2.12.3"
// Compiler flags passed to scalac.
scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature")
// Extra Sonatype repositories to resolve dependencies from.
resolvers ++= Seq(
"Sonatype Snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/",
"Sonatype Releases" at "https://oss.sonatype.org/content/repositories/releases/"
)
// Breeze (distributions / linear algebra) and rscala (R bridge), as imported by the example.
libraryDependencies += "org.scalanlp" %% "breeze-natives" % "0.13.2"
libraryDependencies += "org.scalanlp" %% "breeze" % "0.13.2"
libraryDependencies += "org.ddahl" %% "rscala" % "2.3.5"

Related

how to avoid this error (Error : display Surface quit ) when rendering open-AIgym?

I am trying to solve the mountain car problem in OpenAI Gym. When I call env.render() it works the first time, but when I try to render the simulation again after 2000 runs it gives the error below (error: display Surface quit). How can I avoid this error?
I am using windows, and I am running the code on a jupyter notebook.
import gym
import numpy as np
import sys
# Q-learning hyperparameters and how often (in episodes) to render.
discount = 0.95
Learning_rate = 0.01
episodes = 25000
SHOW_EVERY = 2000

# Create gym environment.
env = gym.make('MountainCar-v0')

# Split every observation dimension into 20 buckets so that the continuous
# state can index a finite Q-table.
discrete_os_size = [20] * len(env.observation_space.high)
discrete_os_win_size = (env.observation_space.high - env.observation_space.low) / discrete_os_size
q_table = np.random.uniform(low=-2, high=0, size=(discrete_os_size + [env.action_space.n]))


def get_discrete_state(state):
    """Convert a continuous observation into its tuple of bucket indices in ``q_table``."""
    discrete_State = (state - env.observation_space.low) / discrete_os_win_size
    return tuple(discrete_State.astype(int))


for episode in range(episodes):
    # Decide once per episode whether this one is rendered.
    render = episode % SHOW_EVERY == 0
    if render:
        print(episode)

    ds = get_discrete_state(env.reset())
    done = False
    while not done:
        # Greedy action with respect to the current Q-table.
        action = np.argmax(q_table[ds])
        new_state, reward, done, _ = env.step(action)
        new_discrete_state = get_discrete_state(new_state)
        if render:
            env.render()
        if not done:
            # Standard Q-learning update towards reward + discounted best future value.
            max_future_q = np.max(q_table[new_discrete_state])
            current_q_value = q_table[ds + (action, )]
            new_q = (1 - Learning_rate) * current_q_value + Learning_rate * (
                reward + discount * max_future_q)
            q_table[ds + (action, )] = new_q
        elif new_state[0] >= env.goal_position:
            # Episode ended by reaching the goal: no penalty for this state/action.
            q_table[ds + (action, )] = 0
        ds = new_discrete_state

# Close the environment only after ALL episodes are finished. Closing it inside
# the episode loop destroys the render window, so the next env.render() call
# fails with "display Surface quit".
env.close()
I faced the same problem. When you call env.close() it closes the environment, so in order to run it again you have to make a new environment. Just comment out env.close() if you want to run the same environment again.

Optimizing Distributed I/O with serial output

I am having trouble understanding how to optimize a distributed component with a serial output. This is my attempt with an example problem given in the openmdao docs.
import numpy as np
import openmdao.api as om
from openmdao.utils.array_utils import evenly_distrib_idxs
from openmdao.utils.mpi import MPI
class MixedDistrib2(om.ExplicitComponent):
    """Component with one distributed and one serial input and output.

    out_dist   = (x**2 - 2x + 4) + sqrt(y)                 (distributed)
    out_serial = (y**2 + 3y - 5) * sum(sqrt(x)) over all x (serial)
    """

    def setup(self):
        # Distributed Input
        self.add_input('in_dist', shape_by_conn=True, distributed=True)
        # Serial Input
        self.add_input('in_serial', val=1)
        # Distributed Output
        self.add_output('out_dist', copy_shape='in_dist', distributed=True)
        # Serial Output
        self.add_output('out_serial', copy_shape='in_serial')
        # No partials are declared here; the original call is left disabled.
        #self.declare_partials('*','*', method='cs')

    def compute(self, inputs, outputs):
        """Evaluate both outputs from this process's slice of ``in_dist`` and ``in_serial``."""
        x = inputs['in_dist']
        y = inputs['in_serial']
        # "Computationally Intensive" operation that we wish to parallelize.
        f_x = x**2 - 2.0*x + 4.0
        # These operations are repeated on all procs.
        f_y = y ** 0.5
        g_y = y**2 + 3.0*y - 5.0
        # Compute square root of our portion of the distributed input.
        g_x = x ** 0.5
        # Distributed output
        outputs['out_dist'] = f_x + f_y
        # Serial output.  Use the component's own communicator rather than a
        # module-level global so the class does not depend on script state.
        if MPI and self.comm.size > 1:
            # We need to gather the summed values to compute the total sum over all procs.
            local_sum = np.array(np.sum(g_x))
            total_sum = local_sum.copy()
            self.comm.Allreduce(local_sum, total_sum, op=MPI.SUM)
            outputs['out_serial'] = g_y * total_sum
        else:
            # Recommended to make sure your code can run in serial too, for testing.
            outputs['out_serial'] = g_y * np.sum(g_x)
# Global length of the distributed variable.
size = 7
if MPI:
    comm = MPI.COMM_WORLD
    rank = comm.rank
    # Split the `size` entries across the processes.
    sizes, offsets = evenly_distrib_idxs(comm.size, size)
else:
    # When running in serial, the entire variable is on rank 0.
    rank = 0
    sizes = {rank : size}
    offsets = {rank : 0}

prob = om.Problem()
model = prob.model

# Create a distributed source for the distributed input.
ivc = om.IndepVarComp()
ivc.add_output('x_dist', np.zeros(sizes[rank]), distributed=True)
ivc.add_output('x_serial', val=1)
model.add_subsystem("indep", ivc)
model.add_subsystem("D1", MixedDistrib2())
model.add_subsystem('con_cmp1', om.ExecComp('con1 = y**2'), promotes=['con1', 'y'])
model.connect('indep.x_dist', 'D1.in_dist')
model.connect('indep.x_serial', ['D1.in_serial','y'])

# Minimise D1.out_serial over x_serial in [5, 10], subject to con1 <= 90.
prob.driver = om.ScipyOptimizeDriver()
prob.driver.options['optimizer'] = 'SLSQP'
model.add_design_var('indep.x_serial', lower=5, upper=10)
model.add_constraint('con1', upper=90)
model.add_objective('D1.out_serial')

prob.setup(force_alloc_complex=True)

# Set initial values of distributed variable.  The local part of x_dist was
# declared with sizes[rank] entries, so size the initial value the same way
# instead of always passing all 7 values.
x_dist_init = np.ones(sizes[rank])
prob.set_val('indep.x_dist', x_dist_init)
# Set initial values of serial variable.
prob.set_val('indep.x_serial', 10)

prob.run_driver()
print('x_dist', prob.get_val('indep.x_dist', get_remote=True))
print('x_serial', prob.get_val('indep.x_serial'))
print('Obj', prob.get_val('D1.out_serial'))
The problem is with defining partials with 'fd' or 'cs'. I cannot define partials of a serial output w.r.t. a distributed input, so I used prob.setup(force_alloc_complex=True) to use complex step. But this gives me the warning DerivativesWarning: Constraints or objectives [('D1.out_serial', inds=[0])] cannot be impacted by the design variables of the problem. I understand this is because the total derivative is 0, which causes the warning, but I don't understand the reason. Clearly the total derivative should not be 0 here. I guess this is because I didn't explicitly call declare_partials in the component. I tried removing the distributed components and ran it again with declare_partials, and this works correctly (code below).
import numpy as np
import openmdao.api as om
class MixedDistrib2(om.ExplicitComponent):
    """Serial variant: out_serial = (y**2 + 3y - 5) * sum(sqrt(x))."""

    def setup(self):
        # A length-7 vector input and a scalar input feed a single scalar output.
        self.add_input('in_dist', np.zeros(7))
        self.add_input('in_serial', val=1)
        self.add_output('out_serial', val=0)
        # Every partial is declared with the complex-step method.
        self.declare_partials('*','*', method='cs')

    def compute(self, inputs, outputs):
        vector_in = inputs['in_dist']
        scalar_in = inputs['in_serial']
        polynomial = scalar_in**2 + 3.0*scalar_in - 5.0
        roots = vector_in ** 0.5
        outputs['out_serial'] = polynomial * np.sum(roots)
# Build the serial test problem: one MixedDistrib2 component plus a constraint component.
prob = om.Problem()
model = prob.model
# Promote D1's variables so they are addressable by their bare names.
model.add_subsystem("D1", MixedDistrib2(), promotes_inputs=['in_dist', 'in_serial'], promotes_outputs=['out_serial'])
# con1 = in_serial**2, sharing the promoted in_serial with D1.
model.add_subsystem('con_cmp1', om.ExecComp('con1 = in_serial**2'), promotes=['con1', 'in_serial'])
# Minimise out_serial over in_serial in [5, 10], subject to con1 <= 90, with SLSQP.
prob.driver = om.ScipyOptimizeDriver()
prob.driver.options['optimizer'] = 'SLSQP'
model.add_design_var('in_serial', lower=5, upper=10)
model.add_constraint('con1', upper=90)
model.add_objective('out_serial')
prob.setup(force_alloc_complex=True)
# Initial values are set after setup().
prob.set_val('in_dist', [1,1,1,1,1,1,1])
prob.set_val('in_serial', 10)
# Run the model once and check total derivatives before optimizing.
prob.run_model()
prob.check_totals()
prob.run_driver()
# Report the final design and objective.
print('x_dist', prob.get_val('in_dist', get_remote=True))
print('x_serial', prob.get_val('in_serial'))
print('Obj', prob.get_val('out_serial'))
What I am trying to understand is
How to use 'fd' or 'cs' in Distributed component with a serial output?
What is the meaning of prob.setup(force_alloc_complex=True)? Does it not force the use of cs in all the components in the problem? If so, why does the total derivative become 0?
When I run your code in OpenMDAO V 3.11.0 (after uncommenting the declare_partials call) I get the following error:
RuntimeError: 'D1' <class MixedDistrib2>: component has defined partial ('out_serial', 'in_dist') which is a serial output wrt a distributed input. This is only supported using the matrix free API.
As the error indicates, you can't use the matrix-based API for derivatives in this situation. The reasons why are a bit subtle, and probably outside the scope of what needs to be dealt with to answer your question here. It boils down to OpenMDAO not knowing what kind of distributed operations are being done in the compute and having no way to manage those details when you propagate things in reverse.
So you need to use the matrix-free derivative APIs in this situation. When you use the matrix-free APIs you DO NOT declare any partials, because you don't want OpenMDAO to allocate any memory for you to store partials in (and you wouldn't use that memory even if it did).
I've coded them for your example here, but I need to note a few important details:
Your example has a distributed IVC, but as of OpenMDAO V3.11.0 you can't get total derivatives with respect to distributed design variables. I assume you just made it that way to make your simple test case, but in case your real problem was set up this way, you need to note this and not do it this way. Instead, make the IVC serial, and use src indices to distribute the correct parts to each proc.
In the example below, the derivatives are correct. However, there seems to be a bug in the check_partials output when running in parallel. So the reverse mode partials look like they are off by a factor of the comm size... this will have to get fixed in later releases.
I only did the derivatives for out_serial. out_dist will work similarly and is left as an exercise for the reader :)
You'll notice that I duplicated some code in the compute and compute_jacvec_product methods. You can abstract this duplicate code out into its own method (or call compute from within compute_jacvec_product by providing your own output dictionary). However, you might be asking why the duplicate call is needed at all. Why can't you store the values from the compute call? The answer is, in large part, that OpenMDAO does not guarantee that compute is always called before compute_jacvec_product. However, I'll also point out that this kind of code duplication is very AD-like. Any AD code will have the same kind of duplication built in, even though you don't see it.
import numpy as np
import openmdao.api as om
from openmdao.utils.array_utils import evenly_distrib_idxs
from openmdao.utils.mpi import MPI
class MixedDistrib2(om.ExplicitComponent):
    """Mixed distributed/serial component with matrix-free derivatives for out_serial.

    out_dist   = (x**2 - 2x + 4) + sqrt(y)                 (distributed)
    out_serial = (y**2 + 3y - 5) * sum(sqrt(x)) over all x (serial)

    No partials are declared: derivatives of ``out_serial`` are supplied through
    ``compute_jacvec_product``.  Derivatives of ``out_dist`` are not implemented.
    """

    def setup(self):
        # Distributed Input
        self.add_input('in_dist', shape_by_conn=True, distributed=True)
        # Serial Input
        self.add_input('in_serial', val=1)
        # Distributed Output
        self.add_output('out_dist', copy_shape='in_dist', distributed=True)
        # Serial Output
        self.add_output('out_serial', copy_shape='in_serial')

    def _total_sqrt_sum(self, x):
        """Return sum(sqrt(x)) over every process's slice of the distributed input."""
        g_x = x ** 0.5
        # Use the component's own communicator rather than a module-level global.
        if MPI and self.comm.size > 1:
            # We need to gather the summed values to compute the total sum over all procs.
            local_sum = np.array(np.sum(g_x))
            total_sum = local_sum.copy()
            self.comm.Allreduce(local_sum, total_sum, op=MPI.SUM)
            return total_sum
        # Recommended to make sure your code can run in serial too, for testing.
        return np.sum(g_x)

    def compute(self, inputs, outputs):
        """Evaluate both outputs from this process's slice of ``in_dist`` and ``in_serial``."""
        x = inputs['in_dist']
        y = inputs['in_serial']
        # "Computationally Intensive" operation that we wish to parallelize.
        f_x = x**2 - 2.0*x + 4.0
        # These operations are repeated on all procs.
        f_y = y ** 0.5
        g_y = y**2 + 3.0*y - 5.0
        # Distributed output
        outputs['out_dist'] = f_x + f_y
        # Serial output
        outputs['out_serial'] = g_y * self._total_sqrt_sum(x)

    def compute_jacvec_product(self, inputs, d_inputs, d_outputs, mode):
        """Accumulate Jacobian-vector products for ``out_serial``.

        In 'fwd' mode ``d_outputs['out_serial']`` is incremented from ``d_inputs``;
        in 'rev' mode ``d_inputs`` is incremented from ``d_outputs['out_serial']``.
        The intermediate values are recomputed here instead of being cached in
        ``compute``.
        """
        if 'out_serial' not in d_outputs:
            return

        x = inputs['in_dist']
        y = inputs['in_serial']
        g_y = y**2 + 3.0*y - 5.0
        total_sum = self._total_sqrt_sum(x)

        num_x = len(x)
        d_g_y__d_y = 2*y + 3.
        d_g_x__d_x = 0.5*x**-0.5

        # out_serial = g_y * total_sum, so the derivative wrt y carries the
        # total_sum factor in BOTH modes (previously it was applied in 'rev' only).
        d_out_serial__d_y = total_sum * d_g_y__d_y
        d_out_serial__d_x = g_y*d_g_x__d_x.reshape((1,num_x))

        if mode == 'fwd':
            if 'in_dist' in d_inputs:
                # NOTE(review): under MPI this uses only the local slice of in_dist;
                # confirm whether the contribution must be reduced across procs.
                d_outputs['out_serial'] += d_out_serial__d_x.dot(d_inputs['in_dist'])
            if 'in_serial' in d_inputs:
                d_outputs['out_serial'] += d_out_serial__d_y.dot(d_inputs['in_serial'])
        elif mode == 'rev':
            if 'in_dist' in d_inputs:
                d_inputs['in_dist'] += d_out_serial__d_x.T.dot(d_outputs['out_serial'])
            if 'in_serial' in d_inputs:
                d_inputs['in_serial'] += d_out_serial__d_y.T.dot(d_outputs['out_serial'])
# Global length of the distributed variable.
size = 7
if MPI:
    comm = MPI.COMM_WORLD
    rank = comm.rank
    # Split the `size` entries across the processes.
    sizes, offsets = evenly_distrib_idxs(comm.size, size)
else:
    # When running in serial, the entire variable is on rank 0.
    rank = 0
    sizes = {rank : size}
    offsets = {rank : 0}
prob = om.Problem()
model = prob.model
# Create a distributed source for the distributed input.
ivc = om.IndepVarComp()
ivc.add_output('x_dist', np.zeros(sizes[rank]), distributed=True)
ivc.add_output('x_serial', val=1)
model.add_subsystem("indep", ivc)
model.add_subsystem("D1", MixedDistrib2())
# con1 = y**2, where y is connected to the serial design variable below.
model.add_subsystem('con_cmp1', om.ExecComp('con1 = y**2'), promotes=['con1', 'y'])
model.connect('indep.x_dist', 'D1.in_dist')
model.connect('indep.x_serial', ['D1.in_serial','y'])
# Optimizer set-up: SLSQP over x_serial in [5, 10] with con1 <= 90.
prob.driver = om.ScipyOptimizeDriver()
prob.driver.options['optimizer'] = 'SLSQP'
model.add_design_var('indep.x_serial', lower=5, upper=10)
model.add_constraint('con1', upper=90)
model.add_objective('D1.out_serial')
prob.setup(force_alloc_complex=True)
#prob.setup()
# Set initial values of distributed variable.
# Sized per rank so it matches the local part declared on the IVC above.
x_dist_init = np.ones(sizes[rank])
prob.set_val('indep.x_dist', x_dist_init)
# Set initial values of serial variable.
prob.set_val('indep.x_serial', 10)
# Only the model is run and its partials checked; the optimizer run is disabled.
prob.run_model()
prob.check_partials()
# prob.run_driver()
print('x_dist', prob.get_val('indep.x_dist', get_remote=True))
print('x_serial', prob.get_val('indep.x_serial'))
print('Obj', prob.get_val('D1.out_serial'))

Tensorflow: 6 layer CNN: OOM (use 10Gb GPU memory)

I am using the following code for running a 6 layer CNN with 2 FC layers on top (on Tesla K-80 GPU).
Somehow, it consumes the entire 10 GB of memory and dies with an out-of-memory error. I know that I can reduce the batch_size and then run, but I also want to run with 15 or 20 CNN layers. What's wrong with the following code, and why does it take all the memory? How should I run the code for a 15-layer CNN?
Code:
import model
import time

with tf.Graph().as_default() as g_train:
    # Input pipeline: queue every TFRecord file under train_dir for num_epochs passes.
    filenames = tf.train.match_filenames_once(FLAGS.train_dir+'*.tfrecords')
    filename_queue = tf.train.string_input_producer(filenames, shuffle=True, num_epochs=FLAGS.num_epochs)
    feats,labels = get_batch_input(filename_queue, batch_size=FLAGS.batch_size)
    ### feats size=(batch_size, 100, 50)
    logits = model.inference(feats, FLAGS.batch_size)
    loss = model.loss(logits, labels, feats)
    tvars = tf.trainable_variables()
    global_step = tf.Variable(0, name='global_step', trainable=False)
    # Add to the Graph operations that train the model.
    train_op = model.training(loss, tvars, global_step, FLAGS.learning_rate, FLAGS.clip_gradients)
    # Add the Op to compare the logits to the labels during evaluation.
    eval_correct = model.evaluation(logits, labels, feats)
    summary_op = tf.merge_all_summaries()
    saver = tf.train.Saver(tf.all_variables(), max_to_keep=15)
    # The op for initializing the variables.
    init_op = tf.initialize_all_variables()
    sess = tf.Session()
    sess.run(init_op)
    summary_writer = tf.train.SummaryWriter(FLAGS.model_dir,
                                            graph=sess.graph)
    # Start input enqueue threads.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        step = 0
        while not coord.should_stop():
            # Time the training step so the progress line can report it; the
            # format string below expects three values (step, loss, seconds).
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time
            if step % 100 == 0:
                print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
                # Update the events file.
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
            if (step == 0) or (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                ckpt_model = os.path.join(FLAGS.model_dir, 'model.ckpt')
                saver.save(sess, ckpt_model, global_step=step)
                #saver.save(sess, FLAGS.model_dir, global_step=step)
            step += 1
    except tf.errors.OutOfRangeError:
        # Raised by the input queue once num_epochs have been consumed.
        print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
    finally:
        # Ask the queue-runner threads to stop BEFORE waiting on them; joining
        # without a stop request can block on threads that never exit.
        coord.request_stop()
        coord.join(threads)
        sess.close()
###################### File model.py ####################
def conv2d(x, W, b, strides=1):
    """Apply a 'SAME'-padded 2-D convolution, add the bias, then ReLU."""
    stride_spec = [1, strides, strides, 1]
    convolved = tf.nn.conv2d(x, W, strides=stride_spec, padding='SAME')
    biased = tf.nn.bias_add(convolved, b)
    return tf.nn.relu(biased)
def maxpool2d(x, k=2,s=2):
    """Max-pool ``x`` with a k-by-k window and stride ``s`` ('SAME' padding)."""
    window = [1, k, k, 1]
    stride_spec = [1, s, s, 1]
    return tf.nn.max_pool(x, ksize=window, strides=stride_spec, padding='SAME')
def inference(feats,batch_size):
    """Build the network on ``feats`` and return ``logits``.

    Only the first convolution + max-pool stage is shown; the remaining layers
    were elided from this listing, so ``logits`` (and ``filter_size``) are not
    defined in the visible code.
    """
    #feats size (batch_size,100,50,1) #batch_size=256
    # First conv layer: 1 input channel -> 256 output channels.
    conv1_w=tf.get_variable("conv1_w", [filter_size,filter_size,1,256],initializer=tf.uniform_unit_scaling_initializer())
    conv1_b=tf.get_variable("conv1_b",[256])
    # Stride-2 convolution followed by a 2x2, stride-2 max-pool.
    conv1 = conv2d(feats, conv1_w, conv1_b,2)
    conv1 = maxpool2d(conv1, k=2,s=2)
    ### This was replicated for 6 layers and the 2 FC connected layers are added
    return logits
def training(loss, train_vars, global_step, learning_rate, clip_gradients):
    """Return an Adam training op that applies globally norm-clipped gradients of ``loss``."""
    # Add a scalar summary for the snapshot loss.
    tf.scalar_summary(loss.op.name, loss)
    # aggregation_method=1 is passed through unchanged
    # (presumably tf.AggregationMethod.EXPERIMENTAL_TREE - TODO confirm).
    raw_gradients = tf.gradients(loss, train_vars, aggregation_method=1)
    clipped_gradients, _ = tf.clip_by_global_norm(raw_gradients, clip_gradients)
    adam = tf.train.AdamOptimizer(learning_rate)
    return adam.apply_gradients(zip(clipped_gradients, train_vars), global_step=global_step)
I am not too sure what the model python library is. If it is something you wrote and can change the setting in the optimizer I would suggest the following which I use in my own code
train_step = tf.train.AdamOptimizer(learning_rate).minimize(cost, aggregation_method = tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)
By default the aggregation_method is ADD_N, but if you change it to EXPERIMENTAL_ACCUMULATE_N or EXPERIMENTAL_TREE this will save a lot of memory. The main memory hog in these programs is that TensorFlow must save the output values at every neuron so that it can compute the gradients. Changing the aggregation_method helps a lot in my experience.
Also, BTW, I don't think there is anything wrong with your code. I can run out of memory on small conv-nets as well.

Evaluation Metrics for Binary Classification in Spark: AUC and PR curve

I was trying to calculate Precision, Recall by Threshold for LogisticRegressionwithLBFGS using BinaryclassificationMetrics.
I got all those. I was trying to figure out if I could get a graphical output of PR and AUC curve.
Pasting my Codes below:
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, MulticlassMetrics}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/** Trains a binary logistic regression with L-BFGS and prints threshold metrics and the PR curve. */
object log_reg_eval_metric {

  def main(args: Array[String]): Unit = {
    System.setProperty("hadoop.home.dir", "c:\\winutil\\")
    val sc = new SparkContext(new SparkConf().setAppName("SparkTest").setMaster("local[*]"))

    // Each CSV line is "label,feature1,feature2,...".
    val data: RDD[String] = sc.textFile("C:/Users/user/Documents/spark-1.5.1-bin-hadoop2.4/data/mllib/credit_approval_2_attr.csv")
    val parsedData = data.map { line =>
      val parts = line.split(',').map(_.toDouble)
      LabeledPoint(parts(0), Vectors.dense(parts.tail))
    }

    // Splitting the data
    val splits: Array[RDD[LabeledPoint]] = parsedData.randomSplit(Array(0.7, 0.3), seed = 11L)
    val training: RDD[LabeledPoint] = splits(0).cache()
    val test: RDD[LabeledPoint] = splits(1)

    // Run training algorithm to build the model
    val model = new LogisticRegressionWithLBFGS()
      .setNumClasses(2)
      .run(training)

    // Clear the prediction threshold so the model will return probabilities
    model.clearThreshold

    // Compute raw scores on the test set
    val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
      val prediction = model.predict(features)
      (prediction, label)
    }

    // Instantiate metrics object
    val metrics = new BinaryClassificationMetrics(predictionAndLabels)

    // Precision by threshold
    val precision = metrics.precisionByThreshold
    precision.foreach { case (t, p) =>
      println(s"Threshold: $t, Precision: $p")
    }

    // Precision-Recall Curve.  `metrics.pr` is an RDD of (recall, precision)
    // pairs; printing the RDD itself only shows its description
    // (e.g. "UnionRDD[39] at union ..."), so collect the points to the driver
    // and print them.  These pairs can be fed to any plotting tool.
    val PRC = metrics.pr
    PRC.collect().foreach { case (recall, precisionAtRecall) =>
      println(s"Recall: $recall, Precision: $precisionAtRecall")
    }

    // Scalar summaries of the PR and ROC curves.
    println(s"Area under PR curve: ${metrics.areaUnderPR}")
    println(s"Area under ROC curve: ${metrics.areaUnderROC}")

    sc.stop()
  }
}
output from print(PRC):
UnionRDD[39] at union at BinaryClassificationMetrics.scala:108
I am not sure what a UnionRDD is or how to use it. Is there any other way to get the graphical output? I am doing my research on it. Any suggestion would be great.
You can use BinaryLogisticRegressionTrainingSummary from the spark.ml package. It provides PR and ROC values out of the box as DataFrames.
You can feed these values to any rendering utility to see the specific curves. (Any multi-line plot with x and y values will display the curves.)

Standard Deviation for SQLite

I've searched the SQLite docs and couldn't find anything, but I've also searched on Google and a few results appeared.
Does SQLite have any built-in Standard Deviation function?
You can calculate the variance in SQL:
create table t (row int);
insert into t values (1),(2),(3);
SELECT AVG((t.row - sub.a) * (t.row - sub.a)) as var from t,
(SELECT AVG(row) AS a FROM t) AS sub;
0.666666666666667
However, you still have to calculate the square root to get the standard deviation.
The aggregate functions supported by SQLite are here:
http://www.sqlite.org/lang_aggfunc.html
STDEV is not in the list.
However, the module extension-functions.c in this page contains a STDEV function.
There is still no built-in stdev function in sqlite. However, you can define (as Alix has done) a user-defined aggregator function. Here is a complete example in Python:
import sqlite3
import math
class StdevFunc:
    """SQLite aggregate computing the sample standard deviation incrementally.

    ``k`` starts at 1, so it is always one more than the number of non-NULL
    values seen so far.
    """

    def __init__(self):
        self.M = 0.0  # running mean
        self.S = 0.0  # running sum of squared deviations
        self.k = 1    # number of values seen, plus one

    def step(self, value):
        # NULLs do not take part in the aggregate.
        if value is not None:
            previous_mean = self.M
            delta = value - previous_mean
            self.M = previous_mean + delta / self.k
            self.S += delta * (value - self.M)
            self.k += 1

    def finalize(self):
        # Fewer than two values: the sample deviation is undefined.
        if self.k >= 3:
            return math.sqrt(self.S / (self.k-2))
        return None
# Demo: register the aggregate on an in-memory database and compare it with avg().
with sqlite3.connect(':memory:') as con:
    con.create_aggregate("stdev", 1, StdevFunc)
    cur = con.cursor()
    cur.execute("create table test(i)")
    sample_rows = [(n,) for n in range(1, 6)]
    cur.executemany("insert into test(i) values (?)", sample_rows)
    # One NULL row, which the aggregate skips.
    cur.execute("insert into test(i) values (null)")

    cur.execute("select avg(i) from test")
    (mean,) = cur.fetchone()
    print("avg: %f" % mean)

    cur.execute("select stdev(i) from test")
    (deviation,) = cur.fetchone()
    print("stdev: %f" % deviation)
This will print:
avg: 3.000000
stdev: 1.581139
Compare with MySQL: http://sqlfiddle.com/#!2/ad42f3/3/0
Use variance formula V(X) = E(X^2) - E(X)^2. In SQL sqlite
SELECT AVG(col*col) - AVG(col)*AVG(col) FROM table
To get standard deviation you need to take the square root V(X)^(1/2)
I implemented the Welford's method (the same as extension-functions.c) as a SQLite UDF:
// Register a one-argument aggregate named 'stdev' on the connection.
// $context carries the running state between rows: k = count of non-NULL
// values, m = running mean, s = running sum of squared deviations.
$db->sqliteCreateAggregate('stdev',
function (&$context, $row, $data) // step callback
{
if (isset($context) !== true) // $context is null at first
{
$context = array
(
'k' => 0,
'm' => 0,
's' => 0,
);
}
if (isset($data) === true) // the standard is non-NULL values only
{
// Single-expression running update: the inner assignments increment k and
// update m while s is being accumulated, so the result depends on the
// order in which the operands are evaluated - do not reorder.
$context['s'] += ($data - $context['m']) * ($data - ($context['m'] += ($data - $context['m']) / ++$context['k']));
}
return $context;
},
function (&$context, $row) // fini callback
{
if ($context['k'] > 0) // return NULL if no non-NULL values exist
{
// Divides by k (not k - 1).
return sqrt($context['s'] / $context['k']);
}
return null;
},
1);
That's in PHP ($db is the PDO object) but it should be trivial to port to another language.
SQLite is soooo cool. <3
a little trick
-- Sample variance in one pass: (n*sum(x^2) - sum(x)^2) / (n*(n-1)).
-- count(value) ignores NULLs just like sum() does, and the * 1.0 forces
-- floating-point division when the column holds integers.
select (count(value)*sum(value*value) - sum(value)*sum(value)) * 1.0
       / (count(value)*(count(value)-1))
from the_table ;
then the only thing left is to calculate sqrt outside.
No, I searched this same issue, and ended having to do the calculations with my application (PHP)
added some error detection in the python functions
class StdevFunc:
    """
    For use as an aggregate function in SQLite.

    Computes the sample standard deviation incrementally.  Values that cannot
    be converted to float (including NULL) are skipped.
    """

    def __init__(self):
        self.M = 0.0  # running mean
        self.S = 0.0  # running sum of squared deviations
        self.k = 0    # number of values accepted so far

    def step(self, value):
        """Fold one row's value into the running mean and squared deviations."""
        try:
            # automatically convert text to float, like the rest of SQLite
            val = float(value)
        except (TypeError, ValueError):
            # None (NULL) or non-numeric text: skip this row.
            return
        tM = self.M
        self.k += 1
        self.M += ((val - tM) / self.k)
        self.S += ((val - tM) * (val - self.M))

    def finalize(self):
        """Return the sample standard deviation, or None with fewer than two values."""
        if self.k <= 1:  # avoid division by zero
            return None
        return math.sqrt(self.S / (self.k-1))
You don't state which version of standard deviation you wish to calculate but variances (standard deviation squared) for either version can be calculated using a combination of the sum() and count() aggregate functions.
select
(count(val)*sum(val*val) - (sum(val)*sum(val)))/((count(val)-1)*(count(val))) as sample_variance,
(count(val)*sum(val*val) - (sum(val)*sum(val)))/((count(val))*(count(val))) as population_variance
from ... ;
It will still be necessary to take the square root of these to obtain the standard deviation.
#!/usr/bin/python
# -*- coding: utf-8 -*-
#Values produced by this script can be verified by follwing the steps
#found at https://support.microsoft.com/en-us/kb/213930 to Verify
#by chosing a non memory based database.
import sqlite3
import math
import random
import os
import sys
import traceback
import random
class StdevFunc:
    """SQLite aggregate for the sample standard deviation.

    Errors inside ``step``/``finalize`` are swallowed, and ``finalize`` yields
    None unless at least three values were accumulated.
    """

    def __init__(self):
        self.M = 0.0 #Mean
        self.V = 0.0 #Used to Calculate Variance
        self.S = 0.0 #Standard Deviation
        self.k = 1 #Number of values accumulated, plus one

    def step(self, value):
        # NULL rows are ignored outright.
        if value is None:
            return None
        try:
            prior_mean = self.M
            self.M += (value - prior_mean) / self.k
            self.V += (value - prior_mean) * (value - self.M)
            self.k += 1
        except Exception:
            # Best effort: a value that cannot be processed is dropped.
            pass
        return None

    def finalize(self):
        try:
            if (self.k - 1) >= 3:
                #Turn the accumulated squared deviations into the variance
                self.V = (self.V / (self.k-2))
                #Standard Deviation is the Square Root of Variance
                self.S = math.sqrt(self.V)
                return self.S
        except Exception:
            pass
        return None
def Histogram(Population):
    """Generate random data in SQLite and print its aggregates and a text histogram.

    Inserts ``Population`` values drawn from ``random.uniform(-1, 1)`` into an
    in-memory ``MyData`` table, reads Avg/Max/Min/Count plus the custom
    ``Stdev`` aggregate, builds bin edges one standard deviation apart starting
    at mean - 3 * Stdev, stores the per-bin counts in a ``Bins`` table and
    prints the report.

    Raises:
        Exception: any error is re-raised as a plain ``Exception`` whose
            message is the formatted traceback.
    """
    try:
        BinCount = 6
        More = 0
        #a = 1 #For testing Trapping
        #b = 0 #and Trace Back
        #c = (a / b) #with Detailed Info
        #If you want to store the Database
        #uncDatabase = os.path.join(os.getcwd(),"BellCurve.db3")
        #con = sqlite3.connect(uncDatabase)
        #If you want the database in Memory
        con = sqlite3.connect(':memory:')
        #row_factory allows accessing fields by Row and Col Name
        con.row_factory = sqlite3.Row
        #Add our Non Persistent, Runtime Standard Deviation Function to the Database
        con.create_aggregate("Stdev", 1, StdevFunc)
        #Lets Grab a Cursor
        cur = con.cursor()
        #Lets Initialize some tables, so each run with be clear of previous run
        cur.executescript('drop table if exists MyData;') #executescript requires ; at the end of the string
        cur.execute("create table IF NOT EXISTS MyData('ID' INTEGER PRIMARY KEY AUTOINCREMENT, 'Val' FLOAT)")
        cur.executescript('drop table if exists Bins;') #executescript requires ; at the end of the string
        cur.execute("create table IF NOT EXISTS Bins('ID' INTEGER PRIMARY KEY AUTOINCREMENT, 'Bin' UNSIGNED INTEGER, 'Val' FLOAT, 'Frequency' UNSIGNED BIG INT)")
        #Lets generate some random data, and insert in to the Database
        for n in range(0,(Population)):
            sql = "insert into MyData(Val) values ({0})".format(random.uniform(-1,1))
            #If Whole Number Integer greater that value of 2, Range Greater that 1.5
            #sql = "insert into MyData(Val) values ({0})".format(random.randint(-1,1))
            cur.execute(sql)
            pass
        #Now let’s calculate some built in Aggregates, that SQLite comes with
        cur.execute("select Avg(Val) from MyData")
        Average = cur.fetchone()[0]
        cur.execute("select Max(Val) from MyData")
        Max = cur.fetchone()[0]
        cur.execute("select Min(Val) from MyData")
        Min = cur.fetchone()[0]
        cur.execute("select Count(Val) from MyData")
        Records = cur.fetchone()[0]
        #Now let’s get Standard Deviation using our function that we added
        cur.execute("select Stdev(Val) from MyData")
        Stdev = cur.fetchone()[0]
        #And Calculate Range
        Range = float(abs(float(Max)-float(Min)))
        # Stdev is None when the aggregate could not produce a value.
        if (Stdev == None):
            print("================================ Data Error ===============================")
            print(" Insufficient Population Size, Or Bad Data.")
            print("*****************************************************************************")
        elif (abs(Max-Min) == 0):
            print("================================ Data Error ===============================")
            print(" The entire Population Contains Identical values, Distribution Incalculable.")
            print("******************************************************************************")
        else:
            Bin = [] #Holds the Bin Values
            Frequency = [] #Holds the Bin Frequency for each Bin
            #Establish the 1st Bin, which is based on (Standard Deviation * 3) being subtracted from the Mean
            Bin.append(float((Average - ((3 * Stdev)))))
            Frequency.append(0)
            #Establish the remaining Bins, which is basically adding 1 Standard Deviation
            #for each interation, -3, -2, -1, 1, 2, 3
            for b in range(0,(BinCount) + 1):
                Bin.append((float(Bin[(b)]) + Stdev))
                Frequency.append(0)
            for b in range(0,(BinCount) + 1):
                #Lets exploit the Database and have it do the hard work calculating distribution
                #of all the Bins, with SQL's between operator, but making it left inclusive, right exclusive.
                sqlBinFreq = "select count(*) as Frequency from MyData where val between {0} and {1} and Val < {2}". \
                format(float((Bin[b])), float(Bin[(b + 1)]), float(Bin[(b + 1)]))
                #If the Database Reports Values that fall between the Current Bin, Store the Frequency to a Bins Table.
                for rowBinFreq in cur.execute(sqlBinFreq):
                    # The count for [Bin[b], Bin[b + 1]) is stored at index b + 1,
                    # while the row inserted for bin b uses Frequency[b].
                    Frequency[(b + 1)] = rowBinFreq['Frequency']
                    sqlBinFreqInsert = "insert into Bins (Bin, Val, Frequency) values ({0}, {1}, {2})". \
                    format(b, float(Bin[b]), Frequency[(b)])
                    cur.execute(sqlBinFreqInsert)
                #Allthough this Demo is not likley produce values that
                #fall outside of Standard Distribution
                #if this demo was to Calculate with real data, we want to know
                #how many non-Standard data points we have.
                More = (More + Frequency[b])
            More = abs((Records - More))
            #Add the More value
            sqlBinFreqInsert = "insert into Bins (Bin, Val, Frequency) values ({0}, {1}, {2})". \
            format((BinCount + 1), float(0), More)
            cur.execute(sqlBinFreqInsert)
            #Now Report the Analysis
            print("================================ The Population ==============================")
            print(" {0} {1} {2} {3} {4} {5}". \
            format("Size".rjust(10, ' '), \
            "Max".rjust(10, ' '), \
            "Min".rjust(10, ' '), \
            "Mean".rjust(10, ' '), \
            "Range".rjust(10, ' '), \
            "Stdev".rjust(10, ' ')))
            print("Aggregates: {0:10d} {1:10.4f} {2:10.4f} {3:10.4f} {4:10.4f} {5:10.4f}". \
            format(Population, Max, Min, Average, Range, Stdev))
            print("================================= The Bell Curve =============================")
            LabelString = "{0} {1} {2} {3}". \
            format("Bin".ljust(8, ' '), \
            "Ranges".rjust(8, ' '), \
            "Frequency".rjust(8, ' '), \
            "Histogram".rjust(6, ' '))
            print(LabelString)
            print("------------------------------------------------------------------------------")
            #Let's Paint a Histogram
            sqlChart = "select * from Bins order by Bin asc"
            for rowChart in cur.execute(sqlChart):
                # NOTE(review): the literal 7 equals BinCount + 1, the index used
                # for the "More" row inserted above; keep them in sync.
                if (rowChart['Bin'] == 7):
                    #Bin 7 is not really a bin, but where we place the values that did not fit into the
                    #Normal Distribution. This script was tested against Excel's Bell Curve Example
                    #https://support.microsoft.com/en-us/kb/213930
                    #and produces the same results. Feel free to test it.
                    BinName = "More"
                    ChartString = "{0:<6} {1:<10} {2:10.0f}". \
                    format(BinName, \
                    "", \
                    More)
                else:
                    #Theses are the actual bins where values fall within the distribution.
                    BinName = (rowChart['Bin'] + 1)
                    #Scale the Chart
                    fPercent = ((float(rowChart['Frequency']) / float(Records) * 100))
                    iPrecent = int(math.ceil(fPercent))
                    ChartString = "{0:<6} {1:10.4f} {2:10.0f} {3}". \
                    format(BinName, \
                    rowChart['Val'], \
                    rowChart['Frequency'], \
                    "".rjust(iPrecent, '#'))
                print(ChartString)
            print("******************************************************************************")
        #Commit to Database
        con.commit()
        #Clean Up
        cur.close()
        con.close()
    except Exception as EXBellCurve:
        pass
        # Re-raise with the full traceback text as the message.
        TraceInfo = traceback.format_exc()
        raise Exception(TraceInfo)

Resources