I am trying to use nvprof to monitor the performance of the GPU. I would like to know the time consumed by host-to-device (HtoD) transfers, device-to-host (DtoH) transfers, and kernel execution on the device.
It worked very well with standard example code from the Numba CUDA website:
from numba import cuda

@cuda.jit
def add_kernel(x, y, out):
    tx = cuda.threadIdx.x  # this is the unique thread ID within a 1D block
    ty = cuda.blockIdx.x   # similarly, this is the unique block ID within the 1D grid

    block_size = cuda.blockDim.x  # number of threads per block
    grid_size = cuda.gridDim.x    # number of blocks in the grid

    start = tx + ty * block_size
    stride = block_size * grid_size

    # assuming x and y inputs are same length
    for i in range(start, x.shape[0], stride):
        out[i] = x[i] + y[i]

if __name__ == "__main__":
    import numpy as np

    n = 100000
    x = np.arange(n).astype(np.float32)
    y = 2 * x
    out = np.empty_like(x)

    threads_per_block = 128
    blocks_per_grid = 30

    add_kernel[blocks_per_grid, threads_per_block](x, y, out)
    print(out[:10])
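To profile it, I run the script under nvprof; assuming the file is saved as add.py (a placeholder name, the actual filename doesn't matter), the invocation is along the lines of:

nvprof python add.py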
Here is the output from nvprof:
However, when I add multiprocessing, as in the following code:
import multiprocessing as mp
from numba import cuda

def fun():

    @cuda.jit
    def add_kernel(x, y, out):
        tx = cuda.threadIdx.x  # this is the unique thread ID within a 1D block
        ty = cuda.blockIdx.x   # similarly, this is the unique block ID within the 1D grid

        block_size = cuda.blockDim.x  # number of threads per block
        grid_size = cuda.gridDim.x    # number of blocks in the grid

        start = tx + ty * block_size
        stride = block_size * grid_size

        # assuming x and y inputs are same length
        for i in range(start, x.shape[0], stride):
            out[i] = x[i] + y[i]

    import numpy as np

    n = 100000
    x = np.arange(n).astype(np.float32)
    y = 2 * x
    out = np.empty_like(x)

    threads_per_block = 128
    blocks_per_grid = 30

    add_kernel[blocks_per_grid, threads_per_block](x, y, out)
    print(out[:10])
    return out

# check gpu condition
p = mp.Process(target=fun)
p.daemon = True
p.start()
p.join()
nvprof seems to attach to the process, but it doesn't output anything, even though it reports that it is profiling:
Furthermore, when I used Ray (a package for distributed computation):
if __name__ == "__main__":
    import multiprocessing

    def fun():
        from numba import cuda
        import ray

        @ray.remote(num_gpus=1)
        def call_ray():

            @cuda.jit
            def add_kernel(x, y, out):
                tx = cuda.threadIdx.x  # this is the unique thread ID within a 1D block
                ty = cuda.blockIdx.x   # similarly, this is the unique block ID within the 1D grid

                block_size = cuda.blockDim.x  # number of threads per block
                grid_size = cuda.gridDim.x    # number of blocks in the grid

                start = tx + ty * block_size
                stride = block_size * grid_size

                # assuming x and y inputs are same length
                for i in range(start, x.shape[0], stride):
                    out[i] = x[i] + y[i]

            import numpy as np

            n = 100000
            x = np.arange(n).astype(np.float32)
            y = 2 * x
            out = np.empty_like(x)

            threads_per_block = 128
            blocks_per_grid = 30

            add_kernel[blocks_per_grid, threads_per_block](x, y, out)
            print(out[:10])
            return out

        ray.shutdown()
        ray.init(redis_address="***")
        out = ray.get(call_ray.remote())

    # check gpu condition
    p = multiprocessing.Process(target=fun)
    p.daemon = True
    p.start()
    p.join()
nvprof doesn't show anything! It doesn't even print the line saying that nvprof is profiling the process (although the code does execute):
Does anyone know how to fix this? Or do I have other options for acquiring this data for distributed computation?
Related
I'm trying to distribute a 2D matrix over a number of processes, with some overlapping regions. I noticed that the Scatterv function of mpi4py doesn't work with 2D arrays, so I was forced to use 1D arrays. The problem is that the displacements on some processes can exceed the limit of a C int. How can I solve this?
import numpy as np
from mpi4py import MPI
from math import ceil

# Dimensions of the matrix
N = 50000
M = 50000

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

n = ceil(N/size)
offset = N-(size-1)*n  # Offset
mf = 100  # Overlap margin in rows (not defined in the original snippet; value assumed)

start = np.zeros(size, dtype=np.float64)   # Start rows of the chunks
end = np.zeros(size, dtype=np.float64)     # End rows of the chunks
sz_loc = np.zeros(size, dtype=np.float64)  # Size of the local chunk

for i in range(size):
    pp = n
    if i == size - 1:
        pp = offset
    start[i] = max(0, i * n - mf)
    end[i] = min(N, i * n + pp + mf)
    sz_loc[i] = (end[i]-start[i])*M

if rank == 0:
    Im = np.array(np.round(10*np.random.rand(N, M)),
                  dtype=np.float32).ravel()
else:
    Im = None

Im_loc = np.zeros((int(sz_loc[rank]),), dtype=np.float32)
comm.Barrier()
comm.Scatterv([Im, sz_loc, start*M, MPI.FLOAT], Im_loc, root=0)
Here is the error I get, running with 10 processes for this example:
comm.Scatterv([Im, sz_loc, start*M, MPI.FLOAT], Im_loc, root=0)
File "mpi4py/MPI/Comm.pyx", line 626, in mpi4py.MPI.Comm.Scatterv
File "mpi4py/MPI/msgbuffer.pxi", line 538, in mpi4py.MPI._p_msg_cco.for_scatter
File "mpi4py/MPI/msgbuffer.pxi", line 440, in mpi4py.MPI._p_msg_cco.for_cco_send
File "mpi4py/MPI/msgbuffer.pxi", line 313, in mpi4py.MPI.message_vector
File "mpi4py/MPI/asarray.pxi", line 22, in mpi4py.MPI.chkarray
File "mpi4py/MPI/asarray.pxi", line 15, in mpi4py.MPI.getarray
OverflowError: value too large to convert to int
In the end, I solved it by defining a new datatype with the Create_contiguous() function.
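For reference, here is a minimal sketch of that idea (my reconstruction, not the original final code): building a contiguous datatype that spans one row of M floats means the counts and displacements passed to Scatterv are expressed in rows rather than elements, so they stay well below the int limit.

# Sketch of the Create_contiguous workaround (assumed reconstruction):
# one unit of the new datatype covers a whole row of M floats.
row_t = MPI.FLOAT.Create_contiguous(M)
row_t.Commit()

counts = (end - start).astype(np.int64)  # chunk sizes, now in rows
displs = start.astype(np.int64)          # offsets, now in rows

comm.Scatterv([Im, counts, displs, row_t], Im_loc, root=0)
row_t.Free()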
I'm using CVXPY through Julia, a language where multidimensional arrays are stored in memory in column-major order. However, CVXPY is written in Python and accepts NumPy-style arrays (which are row-major by default) as constants.
I want to know whether I should care about the ordering, e.g. of the matrix A, when translating Python code like this:
import cvxpy as cp
import numpy as np
m = 30
n = 20
A = np.random.randn(m, n)
b = np.random.randn(m)
# Construct the problem.
x = cp.Variable(n)
objective = cp.Minimize(cp.sum_squares(A*x - b))
constraints = [0 <= x, x <= 1]
prob = cp.Problem(objective, constraints)
to Julia:
using Random, PyCall
cp = pyimport("cvxpy")
m = 30
n = 20
A = randn(m, n)
b = randn(m)
# Construct the problem.
x = cp.Variable(n)
objective = cp.Minimize(cp.sum_squares(cp.matmul(A,x) - b))
constraints = [0 <= x, x <= 1]
prob = cp.Problem(objective, constraints)
I've seen exactly this done, and I don't recall any special handling being needed. So no, it shouldn't cause any issues: as far as I know, PyCall exposes a Julia matrix to Python without copying, as a Fortran-ordered NumPy array, and NumPy (and hence CVXPY) treats C-ordered and Fortran-ordered arrays as the same logical matrix.
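A quick check I find convincing (my own snippet, not from the original post): NumPy treats C-ordered and Fortran-ordered arrays as the same logical matrix, so anything consuming the values, CVXPY included, sees identical data:

import numpy as np

A_c = np.arange(6, dtype=float).reshape(2, 3)  # row-major (C order)
A_f = np.asfortranarray(A_c)                   # same values, column-major layout

print(A_c.flags['C_CONTIGUOUS'], A_f.flags['F_CONTIGUOUS'])  # True True
print(np.array_equal(A_c, A_f))                              # True: same logical matrix
print(np.array_equal(A_c @ np.ones(3), A_f @ np.ones(3)))    # True: same results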
I am writing bridge-sampling code for Brownian motion to simulate sample paths, but I keep getting all zeros as my answer. My code and a picture of the algorithm are below.
#Brownian bridge for GBM
Z <- rnorm(1, mean = 0, sd = 1)
T = 1
W_[0] = 0
W_[T] = sqrt(T)*Z
k = 5
j <- 2^(k-1)
W_ = rep(NA, nt-1)
for (k in 1:K) {
  h = h/2
  for (j in 1:j) {
    Z <- rnorm(1, mean = 0, sd = 1)
    W_[2*(j-1)*h] = 0.5*(W_[2*(j-1)*h] + W_[2*j*h]) + sqrt(h)*Z
    print(W_[2*(j-1)*h])
  }
}
The algorithm is below:
If I am not mistaken about the formulas, in Python it should look something like this:
import numpy as np
import pandas as pd

# Samples a Brownian bridge in the [a,b] interval, see https://en.wikipedia.org/wiki/Brownian_bridge#General_case
def BB(
    T: int = 252,
    a: float = 0,
    b: float = 0,
    alpha: float = 0,
    sigma: float = 1
) -> pd.Series:
    X0 = a
    X = pd.Series([X0])
    t1 = 0
    t2 = T
    for i in np.arange(1, T):
        t = i
        mu = (a + (t - t1)/(t2 - t1)*(b - a))
        s = ((t2 - t)*(t - t1)/(t2 - t1))
        ei = np.random.normal(alpha+mu, sigma+s)
        X.loc[i] = ((T - t)/T**0.5)*ei*((t/(T-t))**0.5)
    XT = b
    X[T] = XT
    return X
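For reference, the mu and s above correspond to the general-case formulas on the linked Wikipedia page: conditioned on B(t1) = a and B(t2) = b, the bridge value at t1 < t < t2 is Gaussian with

mean = a + (t - t1)/(t2 - t1) * (b - a)
variance = (t2 - t)*(t - t1)/(t2 - t1)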
Plot it:
from matplotlib import pyplot as plt
plt.plot(BB(a=0,b=0, T=100))
I have a graph where each node has coordinates in 2D (it's actually a geographic graph, with latitude and longitude.)
I need to verify that if the distance between two edges is less than MAX_DIST then they share a node. Of course, if they intersect, then the distance between them is zero.
The brute-force algorithm is trivial; is there a more efficient one?
I was thinking of trying to adapt https://en.wikipedia.org/wiki/Closest_pair_of_points_problem to graph edges (and ignoring pairs of edges with a shared node), but it is not trivial to do so.
I was curious to see how the R-tree index idea would perform, so I created a small script to test it using two really cool Python libraries: Rtree and shapely.
The snippet generates 1000 segments with 1 < length < 5 and coordinates in the [0, 100] interval, populates the index and then counts the pairs that are closer than MAX_DIST==0.1 (using the classic and the index-based method).
In my tests the index method was around 25x faster using the conditions above; this might vary greatly for your data set but the result is encouraging:
found 532 pairs of close segments using classic method
7.47 seconds for classic count
found 532 pairs of close segments using index method
0.28 seconds for index count
The performance and correctness of the index method depend on how your segments are distributed (how many are close, whether you have very long segments, the parameters used).
import time
import random

from rtree import Rtree
from shapely.geometry import LineString

def generate_segments(number):
    segments = {}
    for i in range(number):
        while True:
            x1 = random.randint(0, 100)
            y1 = random.randint(0, 100)
            x2 = random.randint(0, 100)
            y2 = random.randint(0, 100)
            segment = LineString([(x1, y1), (x2, y2)])
            if 1 < segment.length < 5:  # only add relatively small segments
                segments[i] = segment
                break
    return segments

def populate_index(segments):
    idx = Rtree()
    for index, segment in segments.items():
        idx.add(index, segment.bounds)
    return idx

def count_close_segments(segments, max_distance):
    count = 0
    for i in range(len(segments)-1):
        s1 = segments[i]
        for j in range(i+1, len(segments)):
            s2 = segments[j]
            if s1.distance(s2) < max_distance:
                count += 1
    return count

def count_close_segments_index(segments, idx, max_distance):
    count = 0
    for index, segment in segments.items():
        close_indexes = idx.nearest(segment.bounds, 10)
        for close_index in close_indexes:
            if index >= close_index:  # do not count duplicates
                continue
            close_segment = segments[close_index]
            if segment.distance(close_segment) < max_distance:
                count += 1
    return count

if __name__ == "__main__":
    MAX_DIST = 0.1
    s = generate_segments(1000)
    r_idx = populate_index(s)

    t = time.time()
    print("found %d pairs of close segments using classic method" % count_close_segments(s, MAX_DIST))
    print("%.2f seconds for classic count" % (time.time() - t))

    t = time.time()
    print("found %d pairs of close segments using index method" % count_close_segments_index(s, r_idx, MAX_DIST))
    print("%.2f seconds for index count" % (time.time() - t))
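One caveat on correctness: nearest(bounds, 10) only inspects the 10 nearest bounding boxes, so in principle a pair could be missed if more than 10 segments crowd together. A variation I'd consider (my sketch, not part of the original snippet) queries the index with a bounding box expanded by max_distance instead, which cannot miss a close pair:

def count_close_segments_buffered(segments, idx, max_distance):
    # Same counting logic, but query by an expanded bounding box rather
    # than a fixed number of nearest neighbours.
    count = 0
    for index, segment in segments.items():
        x1, y1, x2, y2 = segment.bounds
        window = (x1 - max_distance, y1 - max_distance,
                  x2 + max_distance, y2 + max_distance)
        for close_index in idx.intersection(window):
            if index >= close_index:  # do not count duplicates
                continue
            if segment.distance(segments[close_index]) < max_distance:
                count += 1
    return count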
https://en.wikipedia.org/wiki/Superellipse
I have read the SO questions on how to point-pick from a circle and an ellipse.
How would one uniformly select random points from the interior of a super-ellipse?
More generally, how would one uniformly select random points from the interior of the curve described by an arbitrary super-formula?
https://en.wikipedia.org/wiki/Superformula
The discarding method is not considered a solution, as it is mathematically unenlightening.
In order to sample the superellipse, let's assume without loss of generality that a = b = 1. The general case can then be obtained by rescaling the corresponding axis.
The points in the first quadrant (positive x-coordinate and positive y-coordinate) can then be parametrized as:
x = r * ( cos(t) )^(2/n)
y = r * ( sin(t) )^(2/n)
with 0 <= r <= 1 and 0 <= t <= pi/2.
Now, we need to sample in r, t so that the sampling transformed into x, y is uniform. To this end, let's calculate the Jacobian of this transform:
dx*dy = (2/n) * r * (sin(2*t)/2)^(2/n - 1) dr*dt
= (1/n) * d(r^2) * d(f(t))
Here, we see that as for the variable r, it is sufficient to sample uniformly the value of r^2 and then transform back with a square root. The dependency on t is a bit more complicated. However, with some effort, one gets
f(t) = -(n/2) * 2F1(1/n, (n-1)/n, 1 + 1/n, cos(t)^2) * cos(t)^(2/n)
where 2F1 is the hypergeometric function.
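As a quick sanity check (my addition, not part of the original derivation), f can be verified to be an antiderivative of the t-dependent Jacobian factor by comparing it against numerical integration:

import numpy as np
from scipy.special import hyp2f1
from scipy.integrate import quad

n = 3.0

def f(t):
    c2 = np.cos(t)**2
    return -(n/2) * hyp2f1(1/n, (n - 1)/n, 1 + 1/n, c2) * np.cos(t)**(2/n)

jacobian_factor = lambda t: (np.sin(2*t)/2)**(2/n - 1)

t0, t1 = 0.3, 1.2  # stay away from the endpoints, where the slope blows up
integral, _ = quad(jacobian_factor, t0, t1)
print(np.isclose(integral, f(t1) - f(t0)))  # expect True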
In order to obtain uniform sampling in x,y, we need now to sample uniformly the range of f(t) for t in [0, pi/2] and then find the t which corresponds to this sampled value, i.e., to solve for t the equation u = f(t) where u is a uniform random variable sampled from [f(0), f(pi/2)]. This is essentially the same method as for r, nevertheless in that case one can calculate the inverse directly.
One small issue with this approach is that the function f is not that well-behaved near zero - the infinite slope makes it quite challenging to find a root of u = f(t). To circumvent this, we can sample only the "upper part" of the first quadrant (i.e., area between lines x=y and x=0) and then obtain all the other points by symmetry (not only in the first quadrant but also for all the other ones).
An implementation of this method in Python could look like:
import numpy as np
from numpy.random import uniform, randint, seed
from scipy.optimize import brenth, ridder, bisect, newton
from scipy.special import gamma, hyp2f1

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

seed(100)

def superellipse_area(n):
    #https://en.wikipedia.org/wiki/Superellipse#Mathematical_properties
    inv_n = 1. / n
    return 4 * ( gamma(1 + inv_n)**2 ) / gamma(1 + 2*inv_n)

def sample_superellipse(n, num_of_points = 2000):

    def f(n, x):
        inv_n = 1. / n
        return -(n/2)*hyp2f1(inv_n, 1 - inv_n, 1 + inv_n, x)*(x**inv_n)

    lb = f(n, 0.5)
    ub = f(n, 0.0)

    points = [None for idx in range(num_of_points)]
    for idx in range(num_of_points):
        r = np.sqrt(uniform())
        v = uniform(lb, ub)

        w = bisect(lambda w: f(n, w**n) - v, 0.0, 0.5**(1/n))
        z = w**n

        x = r * z**(1/n)
        y = r * (1 - z)**(1/n)

        if uniform(-1, 1) < 0:
            y, x = x, y

        x = (2*randint(0, 2) - 1)*x
        y = (2*randint(0, 2) - 1)*y

        points[idx] = [x, y]
    return points

def plot_superellipse(ax, n, points):
    coords_x = [p[0] for p in points]
    coords_y = [p[1] for p in points]

    ax.set_xlim(-1.25, 1.25)
    ax.set_ylim(-1.25, 1.25)
    ax.text(-1.1, 1, '{n:.1f}'.format(n = n), fontsize = 12)
    ax.scatter(coords_x, coords_y, s = 0.6)

params = np.array([[0.5, 1], [2, 4]])

fig = plt.figure(figsize = (6, 6))
gs = gridspec.GridSpec(*params.shape, wspace = 1/32., hspace = 1/32.)
n_rows, n_cols = params.shape

for i in range(n_rows):
    for j in range(n_cols):
        n = params[i, j]
        ax = plt.subplot(gs[i, j])

        if i == n_rows-1:
            ax.set_xticks([-1, 0, 1])
        else:
            ax.set_xticks([])

        if j == 0:
            ax.set_yticks([-1, 0, 1])
        else:
            ax.set_yticks([])

        #ensure that the ellipses have similar point density
        num_of_points = int(superellipse_area(n) / superellipse_area(2) * 4000)

        points = sample_superellipse(n, num_of_points)
        plot_superellipse(ax, n, points)

fig.savefig('fig.png')
This produces a 2x2 grid of scatter plots of the sampled points, for n = 0.5, 1, 2, and 4.