I'm new to OpenCL, with very limited background in C/C++.
I've been given this OpenCL program that adds two vectors, and I'm supposed to figure out how it works. It comes from Intel:
https://www.intel.com/content/www/us/en/programmable/support/support-resources/design-examples/design-software/opencl/vector-addition.html
Would it be correct to say: each kernel uses 1 element from A and 1 element from B to calculate 1 element of Z?
To me, it looks like it determines the number of devices (num_devices), and essentially divides the problem size (N) by num_devices, to determine the number of elements per device (n_per_device[]). Then it creates arrays of random numbers for each device (input_a[] and input_b[]) with n_per_device number of elements.
Then these arrays are used by the kernel, where addition of the whole array is performed and stored as Z.
For example, say if the number of devices available is 1000, and problem size (N) is 1,000,000; the n_per_device is 1000 (and since there is no remainder it is the same for all), and it would generate 1000 arrays of input_a and input_b, with 1000 elements in each. Then a respective pair of arrays of 1000 elements are taken by the kernel and added together - in other words each execution of the kernel adds 1000 elements?
Am I following this correctly, or am I totally wrong here?
The kernel is:
// ACL kernel for adding two input vectors
__kernel void vectorAdd(__global const float *x,
__global const float *y,
__global float *restrict z)
{
// get index of the work item
int index = get_global_id(0);
// add the vector elements
z[index] = x[index] + y[index];
}
The host (main) code is below (sorry it is long; I'm not sure which parts are unimportant):
///////////////////////////////////////////////////////////////////////////////////
// This host program executes a vector addition kernel to perform:
// C = A + B
// where A, B and C are vectors with N elements.
//
// This host program supports partitioning the problem across multiple OpenCL
// devices if available. If there are M available devices, the problem is
// divided so that each device operates on N/M points. The host program
// assumes that all devices are of the same type (that is, the same binary can
// be used), but the code can be generalized to support different device types
// easily.
//
// Verification is performed against the same computation on the host CPU.
///////////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "CL/opencl.h"
#include "AOCL_Utils.h"
using namespace aocl_utils;
// OpenCL runtime configuration
cl_platform_id platform = NULL;
unsigned num_devices = 0;
scoped_array<cl_device_id> device; // num_devices elements
cl_context context = NULL;
scoped_array<cl_command_queue> queue; // num_devices elements
cl_program program = NULL;
scoped_array<cl_kernel> kernel; // num_devices elements
scoped_array<cl_mem> input_a_buf; // num_devices elements
scoped_array<cl_mem> input_b_buf; // num_devices elements
scoped_array<cl_mem> output_buf; // num_devices elements
// Problem data.
const unsigned N = 1000000; // problem size
scoped_array<scoped_aligned_ptr<float> > input_a, input_b; // num_devices elements
scoped_array<scoped_aligned_ptr<float> > output; // num_devices elements
scoped_array<scoped_array<float> > ref_output; // num_devices elements
scoped_array<unsigned> n_per_device; // num_devices elements
// Function prototypes
float rand_float();
bool init_opencl();
void init_problem();
void run();
void cleanup();
// Entry point.
int main() {
// Initialize OpenCL.
if(!init_opencl()) {
return -1;
}
// Initialize the problem data.
// Requires the number of devices to be known.
init_problem();
// Run the kernel.
run();
// Free the resources allocated
cleanup();
return 0;
}
/////// HELPER FUNCTIONS ///////
// Randomly generate a floating-point number between -10 and 10.
float rand_float() {
return float(rand()) / float(RAND_MAX) * 20.0f - 10.0f;
}
// Initializes the OpenCL objects.
bool init_opencl() {
cl_int status;
printf("Initializing OpenCL\n");
if(!setCwdToExeDir()) {
return false;
}
// Get the OpenCL platform.
platform = findPlatform("Altera");
if(platform == NULL) {
printf("ERROR: Unable to find Altera OpenCL platform.\n");
return false;
}
// Query the available OpenCL device.
device.reset(getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices));
printf("Platform: %s\n", getPlatformName(platform).c_str());
printf("Using %d device(s)\n", num_devices);
for(unsigned i = 0; i < num_devices; ++i) {
printf(" %s\n", getDeviceName(device[i]).c_str());
}
// Create the context.
context = clCreateContext(NULL, num_devices, device, NULL, NULL, &status);
checkError(status, "Failed to create context");
// Create the program for all device. Use the first device as the
// representative device (assuming all device are of the same type).
std::string binary_file = getBoardBinaryFile("vectorAdd", device[0]);
printf("Using AOCX: %s\n", binary_file.c_str());
program = createProgramFromBinary(context, binary_file.c_str(), device, num_devices);
// Build the program that was just created.
status = clBuildProgram(program, 0, NULL, "", NULL, NULL);
checkError(status, "Failed to build program");
// Create per-device objects.
queue.reset(num_devices);
kernel.reset(num_devices);
n_per_device.reset(num_devices);
input_a_buf.reset(num_devices);
input_b_buf.reset(num_devices);
output_buf.reset(num_devices);
for(unsigned i = 0; i < num_devices; ++i) {
// Command queue.
queue[i] = clCreateCommandQueue(context, device[i], CL_QUEUE_PROFILING_ENABLE, &status);
checkError(status, "Failed to create command queue");
// Kernel.
const char *kernel_name = "vectorAdd";
kernel[i] = clCreateKernel(program, kernel_name, &status);
checkError(status, "Failed to create kernel");
// Determine the number of elements processed by this device.
n_per_device[i] = N / num_devices; // number of elements handled by this device
// Spread out the remainder of the elements over the first
// N % num_devices.
if(i < (N % num_devices)) {
n_per_device[i]++;
}
// Input buffers.
input_a_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
n_per_device[i] * sizeof(float), NULL, &status);
checkError(status, "Failed to create buffer for input A");
input_b_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
n_per_device[i] * sizeof(float), NULL, &status);
checkError(status, "Failed to create buffer for input B");
// Output buffer.
output_buf[i] = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
n_per_device[i] * sizeof(float), NULL, &status);
checkError(status, "Failed to create buffer for output");
}
return true;
}
// Initialize the data for the problem. Requires num_devices to be known.
void init_problem() {
if(num_devices == 0) {
checkError(-1, "No devices");
}
input_a.reset(num_devices);
input_b.reset(num_devices);
output.reset(num_devices);
ref_output.reset(num_devices);
// Generate input vectors A and B and the reference output consisting
// of a total of N elements.
// We create separate arrays for each device so that each device has an
// aligned buffer.
for(unsigned i = 0; i < num_devices; ++i) {
input_a[i].reset(n_per_device[i]);
input_b[i].reset(n_per_device[i]);
output[i].reset(n_per_device[i]);
ref_output[i].reset(n_per_device[i]);
for(unsigned j = 0; j < n_per_device[i]; ++j) {
input_a[i][j] = rand_float();
input_b[i][j] = rand_float();
ref_output[i][j] = input_a[i][j] + input_b[i][j];
}
}
}
void run() {
cl_int status;
const double start_time = getCurrentTimestamp();
// Launch the problem for each device.
scoped_array<cl_event> kernel_event(num_devices);
scoped_array<cl_event> finish_event(num_devices);
for(unsigned i = 0; i < num_devices; ++i) {
// Transfer inputs to each device. Each of the host buffers supplied to
// clEnqueueWriteBuffer here is already aligned to ensure that DMA is used
// for the host-to-device transfer.
cl_event write_event[2];
status = clEnqueueWriteBuffer(queue[i], input_a_buf[i], CL_FALSE,
0, n_per_device[i] * sizeof(float), input_a[i], 0, NULL, &write_event[0]);
checkError(status, "Failed to transfer input A");
status = clEnqueueWriteBuffer(queue[i], input_b_buf[i], CL_FALSE,
0, n_per_device[i] * sizeof(float), input_b[i], 0, NULL, &write_event[1]);
checkError(status, "Failed to transfer input B");
// Set kernel arguments.
unsigned argi = 0;
status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_a_buf[i]);
checkError(status, "Failed to set argument %d", argi - 1);
status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_b_buf[i]);
checkError(status, "Failed to set argument %d", argi - 1);
status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &output_buf[i]);
checkError(status, "Failed to set argument %d", argi - 1);
// Enqueue kernel.
// Use a global work size corresponding to the number of elements to add
// for this device.
//
// We don't specify a local work size and let the runtime choose
// (it'll choose to use one work-group with the same size as the global
// work-size).
//
// Events are used to ensure that the kernel is not launched until
// the writes to the input buffers have completed.
const size_t global_work_size = n_per_device[i];
printf("Launching for device %d (%d elements)\n", i, global_work_size);
status = clEnqueueNDRangeKernel(queue[i], kernel[i], 1, NULL,
&global_work_size, NULL, 2, write_event, &kernel_event[i]);
checkError(status, "Failed to launch kernel");
// Read the result. This the final operation.
status = clEnqueueReadBuffer(queue[i], output_buf[i], CL_FALSE,
0, n_per_device[i] * sizeof(float), output[i], 1, &kernel_event[i], &finish_event[i]);
// Release local events.
clReleaseEvent(write_event[0]);
clReleaseEvent(write_event[1]);
}
// Wait for all devices to finish.
clWaitForEvents(num_devices, finish_event);
const double end_time = getCurrentTimestamp();
// Wall-clock time taken.
printf("\nTime: %0.3f ms\n", (end_time - start_time) * 1e3);
// Get kernel times using the OpenCL event profiling API.
for(unsigned i = 0; i < num_devices; ++i) {
cl_ulong time_ns = getStartEndTime(kernel_event[i]);
printf("Kernel time (device %d): %0.3f ms\n", i, double(time_ns) * 1e-6);
}
// Release all events.
for(unsigned i = 0; i < num_devices; ++i) {
clReleaseEvent(kernel_event[i]);
clReleaseEvent(finish_event[i]);
}
// Verify results.
bool pass = true;
for(unsigned i = 0; i < num_devices && pass; ++i) {
for(unsigned j = 0; j < n_per_device[i] && pass; ++j) {
if(fabsf(output[i][j] - ref_output[i][j]) > 1.0e-5f) {
printf("Failed verification # device %d, index %d\nOutput: %f\nReference: %f\n",
i, j, output[i][j], ref_output[i][j]);
pass = false;
}
}
}
printf("\nVerification: %s\n", pass ? "PASS" : "FAIL");
}
// Free the resources allocated during initialization
void cleanup() {
for(unsigned i = 0; i < num_devices; ++i) {
if(kernel && kernel[i]) {
clReleaseKernel(kernel[i]);
}
if(queue && queue[i]) {
clReleaseCommandQueue(queue[i]);
}
if(input_a_buf && input_a_buf[i]) {
clReleaseMemObject(input_a_buf[i]);
}
if(input_b_buf && input_b_buf[i]) {
clReleaseMemObject(input_b_buf[i]);
}
if(output_buf && output_buf[i]) {
clReleaseMemObject(output_buf[i]);
}
}
if(program) {
clReleaseProgram(program);
}
if(context) {
clReleaseContext(context);
}
}
There are a few sub-questions here, so let me try to address them individually. I'm going to be slightly pedantic on terminology; I'm not doing that to be snarky, but hopefully it will help you make more sense of documentation, examples, etc.:
Would it be correct to say: each kernel uses 1 element from A and 1 element from B to calculate 1 element of Z?
The kernel is just the code that will run on the OpenCL device. Typically, a kernel is scheduled to run (using clEnqueueNDRangeKernel()) with multiple work-items. With just one work item, there is not much point in bothering with OpenCL at all; the performance benefit comes from massive parallelism. In any case, your quoted statement is correct for each individual work-item processing this kernel. If you run this kernel with 1000 work items, 1000 elements from A will be processed with 1000 elements from B to calculate 1000 elements of Z. The order this happens in is deliberately undefined, and at least groups of elements will be operated on concurrently.
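To make that concrete, here is a rough sketch (not part of the Intel example) of what launching this kernel with a global work size of 1000 is logically equivalent to, if it were written as a plain serial loop on the host:
// Hypothetical serial equivalent of clEnqueueNDRangeKernel(..., global_work_size = 1000, ...):
// each iteration corresponds to one work-item, with i playing the role of get_global_id(0).
for (size_t i = 0; i < 1000; ++i) {
    z[i] = x[i] + y[i];
}
// On the device these "iterations" have no defined order, and many of them run concurrently.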
To me, it looks like it determines the number of devices (num_devices), and essentially divides the problem size (N) by num_devices, to determine the number of elements per device (n_per_device[]). Then it creates arrays of random numbers for each device (input_a[] and input_b[]) with n_per_device number of elements.
Yes, it looks like that to me too.
For example, say if the number of devices available is 1000,
I would just like to point out that you will pretty much never have this many OpenCL devices in a system. The granularity of a single OpenCL device is typically "one GPU," or "all the CPU cores in the system," or "one FPGA accelerator card."
So a "normal" amount of devices on a desktop system is 1, 2, or maybe up to about 4 (e.g. CPU + iGPU + dual discrete GPUs). Big irons with many accelerator cards might have ~16 or so. If you're attempting to accelerate some code in a desktop (or small server) application, you'll usually just pick one device that's likely to be the most appropriate for your problem and run with that. Distributing workload evenly across heterogenous devices is a hard problem for anything but the most basic algorithms.
and problem size (N) is 1,000,000; the n_per_device is 1000 (and since there is no remainder it is the same for all), and it would generate 1000 arrays of input_a and input_b, with 1000 elements in each. Then a respective pair of arrays of 1000 elements are taken by the kernel and added together -
Yes.
in other words each execution of the kernel adds 1000 elements?
Again, this is where using the term "kernel" isn't precise enough. In your example, you would enqueue 1000 work items to execute the kernel on each of the 1000 devices.
I'm writing an application that displays a lot of text. It's not words and sentences, though; it's binary data displayed in the CP437 charset.
I'm having a problem though with drawing those characters. I need to draw each character one by one, because later I would like to apply different coloring. Those characters should have a transparent background as well, because later I would like to draw sections and ranges with different colors in the background (to group those characters based on some criteria).
The application supports multiple opened files at the same time, but when multiple files are open, the drawing slowdown becomes noticeable even on a fast i7, so it's probably badly written.
What would be the best approach to draw this kind of data in Qt5? Should I just prerender the characters to a bitmap and start from there, or is it actually possible to draw lots of characters using the normal Qt text-drawing functions?
Edit: I'm using a normal QFrame widget that does its drawing in paintEvent, using QPainter. Is this the wrong approach? I've read some docs on QGraphicsScene, from which I remember that it's best used in situations where a widget needs to have some control over the objects it draws. I don't need any control over what I draw; I just need to draw it and that's all. I won't reference any particular character after I draw it.
The widget has 2000 lines of code, so I won't paste all of it, but currently my drawing approach is like this:
First, create a table (cache) with 256 entries, looping over it with a counter variable i,
For each entry, create a QStaticText object that contains the drawing information for the character whose ASCII code is i,
Later, in the drawing function, for each byte in the input stream (i.e. from the file), draw the data using a QStaticText from the cache table. So, to draw ASCII character 0x7A, I look up the QStaticText at index 0x7A in the cache table and feed that QStaticText object into the QPainter object.
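To give a rough idea, the cache is built roughly like this (a simplified sketch, not my actual code, and it ignores the Latin-1 vs. CP437 mapping):
QVector<QStaticText> cache(256);
for (int i = 0; i < 256; ++i) {
    cache[i] = QStaticText(QString(QChar(i)));  // the real code maps i through CP437
    cache[i].prepare(QTransform(), font);       // pre-compute the glyph layout
}
// in paintEvent(): painter.drawStaticText(pos, cache[byte]);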
I was also experimenting with a different approach, rendering a whole line in one QPainter::drawText call, and indeed it was faster, but I lost the ability to color each character with a different color, which I would like to keep.
The use of a QGraphicsScene wouldn't improve things - it's an additional layer on top of a QWidget. You're after raw performance, so you shouldn't be using it.
You could implement a QTextDocument as a viewmodel for the visible section of your memory buffer/file, but painting the fresh QTextDocument each time you scroll wouldn't be any faster than drawing things directly on a QWidget.
Using QStaticText is a step in the right direction, but insufficient: rendering QStaticText still requires the rasterization of the glyph's shape. You can do better and cache the pixmap of each QChar, QColor combination that you wish to render: this will be much faster than rasterizing character outlines, whether using QStaticText or not.
Instead of drawing individual characters, you then draw pixmaps from the cache. This commit demonstrates this approach. The character drawing method is:
void drawChar(const QPointF & pos, QChar ch, QColor color, QPainter & p) {
auto & glyph = m_cache[{ch, color}];
if (glyph.isNull()) {
glyph = QPixmap{m_glyphRect.size().toSize()};
glyph.fill(Qt::white);
QPainter p{&glyph};
p.setPen(color);
p.setFont(m_font);
p.drawText(m_glyphPos, {ch});
}
p.drawPixmap(pos, glyph);
}
You could also cache each (character, foreground, background) tuple. Alas, this quickly gets out of hand when there are many foreground/background combinations.
If all of your backgrounds are of the same color (e.g. white), you could instead store a negative mask of the character: the glyph has a white background and a transparent shape. This commit demonstrates this approach. The glyph rectangle is filled with the glyph color, then the white mask is applied on top:
void drawChar(const QPointF & pos, QChar ch, QColor color, QPainter & p) {
auto & glyph = m_glyphs[ch];
if (glyph.isNull()) {
glyph = QImage{m_glyphRect.size().toSize(), QImage::Format_ARGB32_Premultiplied};
glyph.fill(Qt::white);
QPainter p{&glyph};
p.setCompositionMode(QPainter::CompositionMode_DestinationOut);
p.setFont(m_font);
p.drawText(m_glyphPos, {ch});
}
auto rect = m_glyphRect;
rect.moveTo(pos);
p.fillRect(rect, color);
p.drawImage(pos, glyph);
}
Instead of storing a fully pre-rendered character of a given color, you could store just the alpha mask and composite them on-demand:
Start with a pre-rendered white glyph on a transparent background (CompositionMode_Source).
Fill the glyph rect with background in CompositionMode_SourceOut: the background will remain with a hole for the character itself.
Fill the glyph rect with foreground in CompositionMode_DestinationOver: the foreground will fill the hole.
(Optional) Draw the composite on the widget, if you're not painting on the widget already.
This turns out to be reasonably fast, and the rendering is fully parallelizable - see the example below.
Note: The pre-rendered glyph could use further premultiplication of the color with alpha to appear less thick.
Yet another approach, with excellent performance, would be to emulate a text-mode display using the GPU. Store the pre-rendered glyph outlines in a texture, store the glyph indices and colors to be rendered in an array, and use OpenGL and two shaders to do the rendering. This example might be a starting point to implement such an approach.
A complete example, using CPU rendering across multiple threads, follows.
We start with the backing store view, used to produce QImages that are views into the backing store for a given widget, and can be used to parallelize painting.
On a 2013 iMac, this code repaints the full-screen widget in about 8ms.
// https://github.com/KubaO/stackoverflown/tree/master/questions/hex-widget-40458515
#include <QtConcurrent>
#include <QtWidgets>
#include <algorithm>
#include <array>
#include <cmath>
struct BackingStoreView {
QImage *dst = {};
uchar *data = {};
const QWidget *widget = {};
explicit BackingStoreView(const QWidget *widget) {
if (!widget || !widget->window()) return;
dst = dynamic_cast<QImage*>(widget->window()->backingStore()->paintDevice());
if (!dst || dst->depth() % 8) return;
auto byteDepth = dst->depth()/8;
auto pos = widget->mapTo(widget->window(), {});
data = const_cast<uchar*>(dst->constScanLine(pos.y()) + byteDepth * pos.x());
this->widget = widget;
}
// A view onto the backing store of a given widget
QImage getView() const {
if (!data) return {};
QImage ret(data, widget->width(), widget->height(), dst->bytesPerLine(), dst->format());
ret.setDevicePixelRatio(widget->devicePixelRatio());
return ret;
}
// Is a given image exactly this view?
bool isAView(const QImage &img) const {
return data && img.bits() == data && img.depth() == dst->depth()
&& img.width() == widget->width() && img.height() == widget->height()
&& img.bytesPerLine() == dst->bytesPerLine() && img.format() == dst->format();
}
};
Then, the CP437 character set:
static auto const CP437 = QStringLiteral(
" ☺☻♥♦♣♠•◘○◙♂♀♪♫☼▶◀↕‼¶§▬↨↑↓→←∟↔▲▼"
"␣!\"#$%&'()*+,-./0123456789:;<=>?"
"#ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_"
"`abcdefghijklmnopqrstuvwxyz{|}~ "
"ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒ"
"áíóúñѪº¿⌐¬½¼¡«»░▒▓│┤╡╢╖╕╣║╗╝╜╛┐"
"└┴┬├─┼╞╟╚╔╩╦╠═╬╧╨╤╥╙╘╒╓╫╪┘┌█▄▌▐▀"
"αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²■ ");
The HexView widget derives from QAbstractScrollArea and visualizes a memory-mapped chunk of data:
class HexView : public QAbstractScrollArea {
Q_OBJECT
QImage const m_nullImage;
const int m_addressChars = 8;
const int m_dataMargin = 4;
const char * m_data = {};
size_t m_dataSize = 0;
size_t m_dataStart = 0;
QSize m_glyphSize;
QPointF m_glyphPos;
int m_charsPerLine, m_lines;
QMap<QChar, QImage> m_glyphs;
QFont m_font{"Monaco"};
QFontMetricsF m_fm{m_font};
struct DrawUnit { QPoint pos; const QImage *glyph; QColor fg, bg; };
QFutureSynchronizer<void> m_sync;
QVector<DrawUnit> m_chunks;
QVector<QImage> m_stores;
using chunk_it = QVector<DrawUnit>::const_iterator;
using store_it = QVector<QImage>::const_iterator;
static inline QChar decode(char ch) { return CP437[uchar(ch)]; }
inline int xStep() const { return m_glyphSize.width(); }
inline int yStep() const { return m_glyphSize.height(); }
void initData() {
int const width = viewport()->width() - m_addressChars*xStep() - m_dataMargin;
m_charsPerLine = (width > 0) ? width/xStep() : 0;
m_lines = viewport()->height()/yStep();
if (m_charsPerLine && m_lines) {
verticalScrollBar()->setRange(0, m_dataSize/m_charsPerLine);
verticalScrollBar()->setValue(m_dataStart/m_charsPerLine);
} else {
verticalScrollBar()->setRange(0, 0);
}
}
const QImage &glyph(QChar ch) {
auto &glyph = m_glyphs[ch];
if (glyph.isNull()) {
QPointF extent = m_fm.boundingRect(ch).translated(m_glyphPos).bottomRight();
glyph = QImage(m_glyphSize, QImage::Format_ARGB32_Premultiplied);
glyph.fill(Qt::transparent);
QPainter p{&glyph};
p.setPen(Qt::white);
p.setFont(m_font);
p.translate(m_glyphPos);
p.scale(std::min(1.0, (m_glyphSize.width()-1)/extent.x()),
std::min(1.0, (m_glyphSize.height()-1)/extent.y()));
p.drawText(QPointF{}, {ch});
}
return glyph;
}
The parallelized rendering is done in class methods; they don't modify the state of the widget, other than reading read-only data and rendering into the backing store. The threads each act on isolated lines in the store.
static void drawChar(const DrawUnit & u, QPainter &p) {
const QRect rect(u.pos, u.glyph->size());
p.setCompositionMode(QPainter::CompositionMode_Source);
p.drawImage(u.pos, *u.glyph);
p.setCompositionMode(QPainter::CompositionMode_SourceOut);
p.fillRect(rect, u.bg);
p.setCompositionMode(QPainter::CompositionMode_DestinationOver);
p.fillRect(rect, u.fg);
}
static QFuture<void> submitChunks(chunk_it begin, chunk_it end, store_it store) {
return QtConcurrent::run([begin, end, store]{
QPainter p(const_cast<QImage*>(&*store));
for (auto it = begin; it != end; it++)
drawChar(*it, p);
});
}
This method distributes the chunks of work between threads:
int processChunks() {
m_stores.resize(QThread::idealThreadCount());
BackingStoreView view(viewport());
if (!view.isAView(m_stores.last()))
std::generate(m_stores.begin(), m_stores.end(), [&view]{ return view.getView(); });
std::ptrdiff_t jobSize = std::max(128, (m_chunks.size() / m_stores.size())+1);
auto const cend = m_chunks.cend();
int refY = 0;
auto store = m_stores.cbegin();
for (auto it = m_chunks.cbegin(); it != cend;) {
auto end = it + std::min(cend-it, jobSize);
while (end != cend && (end->pos.y() == refY || (refY = end->pos.y(), false)))
end++; // break chunks across line boundaries
m_sync.addFuture(submitChunks(it, end, store));
it = end;
store++;
}
m_sync.waitForFinished();
m_sync.clearFutures();
m_chunks.clear();
return store - m_stores.cbegin();
}
The remainder of the implementation is uncontroversial:
protected:
void paintEvent(QPaintEvent *ev) override {
QElapsedTimer time;
time.start();
QPainter p{viewport()};
QPoint pos;
QPoint const step{xStep(), 0};
auto dividerX = m_addressChars*xStep() + m_dataMargin/2.;
p.drawLine(dividerX, 0, dividerX, viewport()->height());
int offset = 0;
QRect rRect = ev->rect();
p.end();
while (offset < m_charsPerLine*m_lines && m_dataStart + offset < m_dataSize) {
const auto address = QString::number(m_dataStart + offset, 16);
pos += step * (m_addressChars - address.size());
for (auto c : address) {
if (QRect(pos, m_glyphSize).intersects(rRect))
m_chunks.push_back({pos, &glyph(c), Qt::black, Qt::white});
pos += step;
}
pos += {m_dataMargin, 0};
auto bytes = std::min(m_dataSize - offset, (size_t)m_charsPerLine);
for (int n = bytes; n; n--) {
if (QRect(pos, m_glyphSize).intersects(rRect))
m_chunks.push_back({pos, &glyph(decode(m_data[m_dataStart + offset])), Qt::red, Qt::white});
pos += step;
offset ++;
}
pos = {0, pos.y() + yStep()};
}
int jobs = processChunks();
newStatus(QStringLiteral("%1ms n=%2").arg(time.nsecsElapsed()/1e6).arg(jobs));
}
void resizeEvent(QResizeEvent *) override {
initData();
}
void scrollContentsBy(int, int dy) override {
m_dataStart = verticalScrollBar()->value() * (size_t)m_charsPerLine;
viewport()->scroll(0, dy * m_glyphSize.height(), viewport()->rect());
}
public:
HexView(QWidget * parent = nullptr) : HexView(nullptr, 0, parent) {}
HexView(const char * data, size_t size, QWidget * parent = nullptr) :
QAbstractScrollArea{parent}, m_data(data), m_dataSize(size)
{
QRectF glyphRectF{0., 0., 1., 1.};
for (int i = 0x20; i < 0xE0; ++i)
glyphRectF = glyphRectF.united(m_fm.boundingRect(CP437[i]));
m_glyphPos = -glyphRectF.topLeft();
m_glyphSize = QSize(std::ceil(glyphRectF.width()), std::ceil(glyphRectF.height()));
initData();
}
void setData(const char * data, size_t size) {
if (data == m_data && size == m_dataSize) return;
m_data = data;
m_dataSize = size;
m_dataStart = 0;
initData();
viewport()->update();
}
Q_SIGNAL void newStatus(const QString &);
};
We leverage modern 64-bit systems and memory-map the source file to be visualized by the widget. For test purposes, a view of the character set is also available:
int main(int argc, char ** argv) {
QApplication app{argc, argv};
QFile file{app.applicationFilePath()};
if (!file.open(QIODevice::ReadOnly)) return 1;
auto *const map = (const char*)file.map(0, file.size(), QFile::MapPrivateOption);
if (!map) return 2;
QWidget ui;
QGridLayout layout{&ui};
HexView view;
QRadioButton exe{"Executable"};
QRadioButton charset{"Character Set"};
QLabel status;
layout.addWidget(&view, 0, 0, 1, 4);
layout.addWidget(&exe, 1, 0);
layout.addWidget(&charset, 1, 1);
layout.addWidget(&status, 1, 2, 1, 2);
QObject::connect(&exe, &QPushButton::clicked, [&]{
view.setData(map, (size_t)file.size());
});
QObject::connect(&charset, &QPushButton::clicked, [&]{
static std::array<char, 256> data;
std::iota(data.begin(), data.end(), char(0));
view.setData(data.data(), data.size());
});
QObject::connect(&view, &HexView::newStatus, &status, &QLabel::setText);
charset.click();
ui.resize(1000, 800);
ui.show();
return app.exec();
}
#include "main.moc"
One solution I sometimes use is to keep a cache of pre-rendered lines. I normally use a doubly-linked LRU list of entries holding about twice the number of lines that can be seen on the screen. Every time a line is used for rendering, it is moved to the front of the list; when I need to create a new line and the current cache count is past the limit, I reuse the last entry in the list.
By storing the final result of individual lines you can repaint the display very quickly, since in many cases most of the lines will not change from one frame to the next (including when scrolling).
The added complexity is also reasonably confined: you only have to invalidate a line when its content changes.
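A minimal sketch of that idea, using QCache (which already evicts least-recently-used entries) keyed by line number; the line metrics and the drawLineContents() helper are assumptions, not real code:
#include <QCache>
#include <QPixmap>
#include <QPainter>

struct LineCache {
    QCache<int, QPixmap> cache;                  // owns the pixmaps, evicts LRU entries
    explicit LineCache(int visibleLines) { cache.setMaxCost(2 * visibleLines); }

    const QPixmap &line(int n, int w, int h) {
        QPixmap *pix = cache.object(n);
        if (!pix) {
            pix = new QPixmap(w, h);
            pix->fill(Qt::white);
            QPainter p(pix);
            drawLineContents(n, p);              // assumed: paints one line of text
            cache.insert(n, pix);                // the cache takes ownership
        }
        return *pix;
    }
    void invalidate(int n) { cache.remove(n); }  // call when a line's content changes

    void drawLineContents(int n, QPainter &p);   // placeholder for the widget's own painting
};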
I have a QGraphicsScene whose size is 62450x4750. Sometimes I need to take a screenshot of the whole scene and save it to a file. I tried it like this:
QPixmap wholeScene(scene.sceneRect().size().toSize());
{
QPainter wholeScenePainter(&wholeScene);
scene.render(&wholeScenePainter);
}
// saving pixmap
or
QPixmap wholeScene(scene.sceneRect().size().toSize());
{
QPainter wholeScenePainter(&wholeScene);
int x = 0;
int portion = 32768;
while( x < scene.sceneRect().width()) {
int width = scene.sceneRect().width() - x > portion ? portion : scene.sceneRect().width() - x;
QRect rect(x, 0, width, scene.sceneRect().height());
scene.render(&wholeScenePainter, rect, rect);
x += width;
}
}
// saving pixmap
or
QPixmap wholeScene(scene.sceneRect().size().toSize());
{
QPainter wholeScenePainter(&wholeScene);
int x = 0;
int portion = 4096;
while( x < scene.sceneRect().width()) {
int width = scene.sceneRect().width() - x > portion ? portion : scene.sceneRect().width() - x;
QRect rect(x, 0, width, scene.sceneRect().height());
QPixmap temp(rect.size());
QPainter p(&temp);
scene.render(&p, QRect(0, 0, rect.width(), rect.height()), rect);
wholeScenePainter.drawPixmap(x,0, temp);
//temp.save(QString("print%1.png").arg(QString::number(x)), "PNG");
x += width;
}
}
// saving pixmap
But every time, the resulting image (pixmap) gets cut off at a width of 32768 px.
According to the documentation, QPainter does not support coordinates larger than +/- 32768. This does not appear to be fixed in Qt 5 either.
Maybe you can solve this by rendering the scene in multiple passes, e.g. through translation and clipping. You can render the scene in multiple blocks of max 32768x32768 pixels and put them in the proper position in the final image.
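For example, a sketch of that tiled approach might look like the following; it renders vertical strips of at most 32000 px into small images and copies the pixels into the final image by hand, so QPainter never sees a device coordinate above the limit (scene is assumed to be your QGraphicsScene):
QSize size = scene.sceneRect().size().toSize();
QImage whole(size, QImage::Format_ARGB32);
whole.fill(Qt::white);

const int portion = 32000;
for (int x = 0; x < size.width(); x += portion) {
    const int w = qMin(portion, size.width() - x);
    QImage strip(w, size.height(), whole.format());
    strip.fill(Qt::white);
    QPainter p(&strip);
    // Map the strip back to (0,0), so the painter only ever paints at small coordinates.
    scene.render(&p, QRectF(0, 0, w, size.height()), QRectF(x, 0, w, size.height()));
    p.end();
    // Copy the strip's scanlines into the final image directly (4 bytes per ARGB32 pixel).
    for (int y = 0; y < size.height(); ++y)
        memcpy(whole.scanLine(y) + x * 4, strip.constScanLine(y), w * 4);
}
whole.save("wholeScene.png");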
I am currently trying to encode raw RGB24 images via x265. I already did this successfully with the x264 library, but a few things have changed in the x265 library compared to x264.
Here is the problem in short: I want to convert the image I have from RGB24 to YUV 4:2:0 via the sws_scale function of FFmpeg. The prototype of the function is:
int sws_scale(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t* dst[], int dstStride[])
Assuming image contains my raw image, and srcstride and m_height are the corresponding RGB stride and height of my image, I made the following call with x264:
sws_scale(convertCtx, &image, &srcstride, 0, m_height, pic_in.img.plane, pic_in.img.i_stride);
pic_in is of type x264_picture_t, which looks (briefly) as follows:
typedef struct
{
...
x264_image_t img;
} x264_picture_t;
with x264_image_t
typedef struct
{
...
int i_stride[4];
uint8_t *plane[4];
} x264_image_t;
Now, in x265 the structures have slightly changed to
typedef struct x265_picture
{
...
void* planes[3];
int stride[3];
} x265_picture;
And I am now not quite sure how to call the same function
sws_scale(convertCtx, &image, &srcstride, 0, m_height, ????, pic_in.stride);
I tried creating a temporary array, and then copying back and recasting the array items, but it doesn't seem to work:
pic.planes[i] = reinterpret_cast<void*>(tmp[i]) ;
Can someone help me out?
Thanks a lot :)
Edit
I figured it out now
outputSlice = sws_scale(convertCtx, &image, &srcstride, 0, m_height, reinterpret_cast<uint8_t**>(pic_in.planes), pic_in.stride);
This seems to do the trick :)
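For completeness, the whole conversion path then looks roughly like this (just a sketch; m_width, m_height, image and pic_in are my existing variables, with pic_in allocated for 8-bit YUV 4:2:0):
SwsContext *convertCtx = sws_getContext(m_width, m_height, AV_PIX_FMT_RGB24,
                                        m_width, m_height, AV_PIX_FMT_YUV420P,
                                        SWS_BICUBIC, NULL, NULL, NULL);

uint8_t *srcSlice[1] = { image };       // one packed RGB24 plane
int srcStride[1]     = { 3 * m_width }; // 3 bytes per RGB24 pixel

sws_scale(convertCtx, srcSlice, srcStride, 0, m_height,
          reinterpret_cast<uint8_t**>(pic_in.planes), pic_in.stride);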
And by the way, for other people who are experimenting with x265: in x264 there was an x264_picture_alloc function which I didn't manage to find in x265. So here is a function that I used in my application, and which does the trick.
void x265_picture_alloc_custom( x265_picture *pic, int csp, int width, int height, uint32_t depth) {
x265_picture_init(&mParam, pic); // mParam is an x265_param defined elsewhere in my code
pic->colorSpace = csp;
pic->bitDepth = depth;
pic->sliceType = X265_TYPE_AUTO;
uint32_t pixelbytes = depth > 8 ? 2 : 1;
uint32_t framesize = 0;
for (int i = 0; i < x265_cli_csps[csp].planes; i++)
{
uint32_t w = width >> x265_cli_csps[csp].width[i];
uint32_t h = height >> x265_cli_csps[csp].height[i];
framesize += w * h * pixelbytes;
}
pic->planes[0] = new char[framesize];
pic->planes[1] = (char*)(pic->planes[0]) + width * height * pixelbytes;
pic->planes[2] = (char*)(pic->planes[1]) + ((width * height * pixelbytes) >> 2);
pic->stride[0] = width;
pic->stride[1] = pic->stride[2] = pic->stride[0] >> 1;
}
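A call would then look something like this (a sketch, reusing the same m_width/m_height as above and x265's X265_CSP_I420 constant for 8-bit 4:2:0):
x265_picture pic_in;
x265_picture_alloc_custom(&pic_in, X265_CSP_I420, m_width, m_height, 8);
// ...fill pic_in.planes via sws_scale, encode, then delete[] (char*)pic_in.planes[0] when done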
And I am now not quite sure how to call the same function
sws_scale(convertCtx, &image, &srcstride, 0, m_height, ????, pic_in.stride);
Have you tried:
sws_scale(convertCtx, &image, &srcstride, 0, m_height, pic_in.planes,pic_in.stride);
What error do you get? Have you initialized the memory of the x265_picture?
I am writing a video content analysis application which analyses recorded and live videos.
I use OpenGL to display the videos on a Qt interface (using QGLWidget). I am using texture mapping with pixel buffer objects, if the graphics card supports them (here's the reference: http://www.songho.ca/opengl/gl_pbo.html), to display the video (loaded from OpenCV's IplImage).
The problem is that, the memory for the application keeps on increasing over time. Approx. 4-8KB per second. I am using the task manager to verify this.
I have narrowed the issue down to the rendering of the video, because I saw a lot of posts about textures not being freed leading to memory growth, but I haven't been able to find a solution for my problem.
I am only using glGenTextures in initializeGL() so the texture is being generated only once and reused.
Here's the code wherein the problem lies:
void paintGL(){
static int index = 0;
int nextIndex = 0; // pbo index used for next frame
if(paintFlag){
if(pboMode > 0) {
// "index" is used to copy pixels from a PBO to a texture object "nextIndex" is used to update pixels in a PBO
if(pboMode == 1){
// In single PBO mode, the index and nextIndex are set to 0
index = nextIndex = 0;
}
else if(pboMode == 2)
{
// In dual PBO mode, increment current index first then get the next index
index = (index + 1) % 2;
nextIndex = (index + 1) % 2;
}
// start to copy from PBO to texture object ///////
// bind the texture and PBO
glBindTexture(GL_TEXTURE_2D, texture);
glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pboIds[index]);
// copy pixels from PBO to texture object
// Use offset instead of ponter.
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, WIDTH, HEIGHT, GL_BGR, GL_UNSIGNED_BYTE, 0);
// measure the time copying data from PBO to texture object
//t1.stop();
//copyTime = t1.getElapsedTimeInMilliSec();
///////////////////////////////////////////////////
// start to modify pixel values ///////////////////
// t1.start();
// bind PBO to update pixel values
glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pboIds[nextIndex]);
// map the buffer object into client's memory
// Note that glMapBufferARB() causes sync issue.
// If GPU is working with this buffer, glMapBufferARB() will wait(stall)
// for GPU to finish its job. To avoid waiting (stall), you can call
// first glBufferDataARB() with NULL pointer before glMapBufferARB().
// If you do that, the previous data in PBO will be discarded and
// glMapBufferARB() returns a new allocated pointer immediately
// even if GPU is still working with the previous data.
glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, DATA_SIZE, 0, GL_STREAM_DRAW_ARB);
GLubyte* ptr = (GLubyte*)glMapBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, GL_WRITE_ONLY_ARB);
if(ptr)
{
// update data directly on the mapped buffer
//updatePixels(ptr, DATA_SIZE);
memcpy(ptr,original->imageData,DATA_SIZE);
glUnmapBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB); // release pointer to mapping buffer
}
// measure the time modifying the mapped buffer
//t1.stop();
//updateTime = t1.getElapsedTimeInMilliSec();
///////////////////////////////////////////////////
// it is good idea to release PBOs with ID 0 after use.
// Once bound with 0, all pixel operations behave normal ways.
glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
}
else
{
///////////////////////////////////////////////////
// start to copy pixels from system memory to texture object
//t1.start();
glBindTexture(GL_TEXTURE_2D, texture);
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, WIDTH, HEIGHT, GL_BGR, GL_UNSIGNED_BYTE, (GLvoid*)original->imageData);
//t1.stop();
//copyTime = t1.getElapsedTimeInMilliSec();
}
paintFlag=false;
}
// clear buffer
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT);
glBegin(GL_QUADS);
glTexCoord2i(0,1); glVertex2i(0,HEIGHT);
glTexCoord2i(0,0); glVertex2i(0,0);
glTexCoord2i(1,0); glVertex2i(WIDTH,0);
glTexCoord2i(1,1); glVertex2i(WIDTH,HEIGHT);
glEnd();
glFlush();
glBindTexture(GL_TEXTURE_2D, 0);
swapBuffers();
glDeleteBuffers(1,&texture);
updateGL();
}
The code is pretty much the same as in the tutorial. However, my texture data comes from an IplImage structure which is continuously updated by a separate thread. I am also using boost's lock_guard for synchronization purposes.
Is there anything wrong that I am doing here?
EDIT: I am adding the remaining code:
//Constructor, this is where all the allocation happens
const int DATA_SIZE = WIDTH * HEIGHT * 3;
QGLCanvas::QGLCanvas(QWidget* parent,QString caption)
: QGLWidget(parent)
{
imageFormat=QImage::Format_RGB888;
this->name=caption;
original=cvCreateImage(cvSize(WIDTH,HEIGHT),IPL_DEPTH_8U,3);
if(this->name=="Background")
bgFrameBackup=cvCreateImage(cvSize(WIDTH,HEIGHT),IPL_DEPTH_8U,3);
cvZero(original);
//cvShowImage("w",original);
//cvWaitKey(0);
switch(original->nChannels) {
case 1:
format = GL_LUMINANCE;
break;
case 2:
format = GL_LUMINANCE_ALPHA;
break;
case 3:
format = GL_BGR;
break;
default:
return;
}
drawing=false;
setMouseTracking(true);
mouseX=0;mouseY=0;
startX=0; endX=0;
startY=0; endY=0;
dialog=new EntryExitRuleDialog();
makeCurrent();
GLenum result=glewInit();
if(result){
qDebug()<<(const char*)(glewGetErrorString(result));
}
//qDebug()<<"Open GL Version: "<<(const char*)glGetString(GL_VERSION);
bgColor=QColor::fromRgb(100,100,100);
initializeGL();
qglClearColor(bgColor);
glInfo glInfo;
glInfo.getInfo();
#ifdef _WIN32
// check PBO is supported by your video card
if(glInfo.isExtensionSupported("GL_ARB_pixel_buffer_object"))
{
// get pointers to GL functions
glGenBuffersARB = (PFNGLGENBUFFERSARBPROC)wglGetProcAddress("glGenBuffersARB");
glBindBufferARB = (PFNGLBINDBUFFERARBPROC)wglGetProcAddress("glBindBufferARB");
glBufferDataARB = (PFNGLBUFFERDATAARBPROC)wglGetProcAddress("glBufferDataARB");
glBufferSubDataARB = (PFNGLBUFFERSUBDATAARBPROC)wglGetProcAddress("glBufferSubDataARB");
glDeleteBuffersARB = (PFNGLDELETEBUFFERSARBPROC)wglGetProcAddress("glDeleteBuffersARB");
glGetBufferParameterivARB = (PFNGLGETBUFFERPARAMETERIVARBPROC)wglGetProcAddress("glGetBufferParameterivARB");
glMapBufferARB = (PFNGLMAPBUFFERARBPROC)wglGetProcAddress("glMapBufferARB");
glUnmapBufferARB = (PFNGLUNMAPBUFFERARBPROC)wglGetProcAddress("glUnmapBufferARB");
// check once again PBO extension
if(glGenBuffersARB && glBindBufferARB && glBufferDataARB && glBufferSubDataARB &&
glMapBufferARB && glUnmapBufferARB && glDeleteBuffersARB && glGetBufferParameterivARB)
{
pboSupported = true;
cout << "Video card supports GL_ARB_pixel_buffer_object." << endl;
}
else
{
pboSupported = false;
cout << "Video card does NOT support GL_ARB_pixel_buffer_object." << endl;
}
}
#else // for linux, do not need to get function pointers, it is up-to-date
if(glInfo.isExtensionSupported("GL_ARB_pixel_buffer_object"))
{
pboSupported = pboUsed = true;
cout << "Video card supports GL_ARB_pixel_buffer_object." << endl;
}
else
{
pboSupported = pboUsed = false;
cout << "Video card does NOT support GL_ARB_pixel_buffer_object." << endl;
}
#endif
if(pboSupported){
glGenBuffersARB(2, pboIds);
glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pboIds[0]);
glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, DATA_SIZE, 0, GL_STREAM_DRAW_ARB);
glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pboIds[1]);
glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, DATA_SIZE, 0, GL_STREAM_DRAW_ARB);
glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
//Note: pboMode=2 somehow does not work while calibration. Fix this later.
pboMode=1;
}
else{
pboMode=0;
}
paintFlag=false;
}
void QGLCanvas::setImage(IplImage image){
if(QString(this->name)=="Background"){
cvCopyImage(&image,bgFrameBackup);
}
//cvShowImage(name,&image);
// Display a rectangle between startX ,startY and endX,endY if we are in calibration mode
//and drawing flag is set.(typically, by a mouse click)
if(QString(this->name)=="Calibrate" && calibrating ){
if(drawing)
cvRectangle(&image,cvPoint(startX,startY),cvPoint(endX,endY),cvScalarAll(0xee));
if(select_object) //During calibration
cvRectangle(&image,cvPoint(selection.x,selection.y),cvPoint(selection.x+selection.width,selection.y+selection.height),cvScalarAll(0xee));
//Draw existing calibration rectangles
for (list<CvRect>::iterator it=calibration_rect_list->begin(); it!=calibration_rect_list->end(); ++it)
{
cvRectangle(&image, cvPoint((*it).x, (*it).y), cvPoint((*it).x + (*it).width, (*it).y + (*it).height), CV_RGB(100,255,0), 2, 8, 0);
}
}
//Only draw on the video widget with the name "Final"
if(QString(this->name)=="Final")
{
if(calibrating && drawing)
cvRectangle(&image,cvPoint(startX,startY),cvPoint(endX,endY),cvScalarAll(0xee));
//If we are adding a rule, the corresponding rule shape must be drawn on the widget.
if(addingRule && drawing){
if(currentShape==RULE_SHAPE_RECT){
cvRectangle(&image,cvPoint(startX,startY),cvPoint(endX,endY),cvScalarAll(0xee));
}
else if(currentShape==RULE_SHAPE_POLY){
int linecolor=0xee;
if(points.count()>0){
//Draw polygon...
for(int i=1;i<points.count();i++){
cvLine(&image,cvPoint(points[i-1]->x(),points[i-1]->y()),cvPoint(points[i]->x(),points[i]->y()),cvScalarAll(linecolor));
}
cvLine(&image,cvPoint(startX,startY),cvPoint(endX,endY),cvScalarAll(0xee));
cvLine(&image,cvPoint(endX,endY),cvPoint(points[0]->x(),points[0]->y()),cvScalarAll(linecolor));
}
}
else if(currentShape==RULE_SHAPE_TRIPLINE){
for(int i=1;i<points.count();i++){
cvLine(&image,cvPoint(points[i-1]->x(),points[i-1]->y()),cvPoint(points[i]->x(),points[i]->y()),cvScalarAll(0xee));
}
cvLine(&image,cvPoint(startX,startY),cvPoint(endX,endY),cvScalarAll(0xee));
}
}
if(entryExitRuleCreated && currentZoneType==RULE_ZONE_TYPE_ENTRY_EXIT ){
//Highlight appropriate sides of the currentRule to mark them as Entry/Exit Zone
for(int i=0;i<currentRule->points.count();i++){
QPoint* P1=currentRule->points[i];
QPoint* P2;
//Implement cyclic nature of polygon
if(i<currentRule->points.count()-1)
P2=currentRule->points[i+1];
else P2=currentRule->points[0];
int deltax=mouseX-P1->x();
int deltax1=P2->x()-P1->x();
float m,m1;
if(deltax!=0)
m= (float)(mouseY-P1->y())/deltax;
if(deltax1!=0 && deltax!=0){
m1=(float)(P2->y()-P1->y())/deltax1;
if(round(m,1)==round(m1,1))//Mouse pointer lies on the line whose slope is same as the polygon edge
{
//Mouse pointer is on the edge of a polygon, highlight the edge
if(abs(P1->y()-P2->y()) >= abs(mouseY-P2->y()) && abs(P1->y()-P2->y()) >= abs(mouseY-P1->y())
&& abs(P1->x()-P2->x()) >= abs(mouseX-P2->x()) && abs(P1->x()-P2->x()) >= abs(mouseX-P1->x())
){
edgeHighlighted=true;
highlightedEdge[0]=P1;
highlightedEdge[1]=P2;
currentEdgeNumber=i;
break;
}
}
else{
edgeHighlighted=false;
}
}
else{
//Vertical edge of a polygon.
if(abs(mouseX-P1->x())<4) { //Same vertical line
if(abs(P1->y()-P2->y()) > abs(mouseY-P2->y()) && abs(P1->y()-P2->y()) > abs(mouseY-P1->y())){
//Current y lies between the two vertices of an edge
//Mouse pointer is on the edge of polygon,highlight the edge
//qDebug()<<"P1="<<P1->x()<<","<<P1->y()<<", P2="<<P2->x()<<","<<P2->y();
edgeHighlighted=true;
highlightedEdge[0]=P1;
highlightedEdge[1]=P2;
currentEdgeNumber=i;
break;
}
else
edgeHighlighted=false;
}
}
}
if(edgeHighlighted || edgeHighlightedFromButton){
cvLine(&image,cvPoint(highlightedEdge[0]->x(),highlightedEdge[0]->y()),cvPoint(highlightedEdge[1]->x(),highlightedEdge[1]->y()),cvScalar(0xff,0x00,0x00),3);
}
}
}
{
//qDebug()<<name<<":Saving original image";
ExclusiveLock xlock(globalXMutex);
this->original=ℑ
paintFlag=true;
}
updateGL();
/*if(this->name=="Final"){
cvShowImage("Final",original);
cvWaitKey(1);
}*/
}
//Texture is generated here
void QGLCanvas::initializeGL(){
glDisable(GL_LIGHTING);
glEnable(GL_TEXTURE_2D);
glClearColor(0, 0, 0, 0); // background color
glClearStencil(0); // clear stencil buffer
glClearDepth(1.0f); // 0 is near, 1 is far
glDepthFunc(GL_LEQUAL);
glEnable(GL_TEXTURE_2D);
glGenTextures(1,&texture);
glBindTexture(GL_TEXTURE_2D,texture);
glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP);
glBindTexture(GL_TEXTURE_2D,texture);
glTexImage2D(GL_TEXTURE_2D,0,GL_RGB,WIDTH,HEIGHT,0,GL_BGR,GL_UNSIGNED_BYTE,NULL);
glBindTexture(GL_TEXTURE_2D, 0);
glClearStencil(0); // clear stencil buffer
glClearDepth(1.0f); // 0 is near, 1 is far
glDepthFunc(GL_LEQUAL);
setAutoBufferSwap(false);
}
void QGLCanvas::resizeGL(int width,int height){
if (height==0) // Prevent A Divide By Zero By
{
height=1; // Making Height Equal One
}
glViewport(0,0,WIDTH,HEIGHT); // Reset The Current Viewport
glMatrixMode(GL_PROJECTION); // Select The Projection Matrix
glLoadIdentity(); // Reset The Projection Matrix
glOrtho(0.0f,WIDTH,HEIGHT,0.0f,0.0f,1.0f);
glEnable(GL_TEXTURE_2D);
glMatrixMode(GL_MODELVIEW); // Select The Modelview Matrix
glLoadIdentity(); // Reset The Modelview Matrix
}
You're calling glDeleteBuffers() on a texture object (it should be a buffer object), or rather, the call should not be there at all, I think. Like other GL objects, call glDelete*() only once for every glGen*() call.
You're calling glFlush() and swapBuffers(); I believe Qt takes care of that for you.
The OpenGL driver could have a memory leak. Try it without PBO.
Try glGetError() after each GL call to see if you've made a mistake elsewhere.
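On the first point, a sketch of what the fix could look like: delete nothing per frame, and release the texture and the PBOs exactly once when the widget is torn down (assuming the texture, pboIds and pboSupported members shown above):
// In paintGL(): remove the glDeleteBuffers(1, &texture) call entirely.
// Release the GL objects once, e.g. in the destructor, with the context current:
QGLCanvas::~QGLCanvas() {
    makeCurrent();
    glDeleteTextures(1, &texture);        // matches the single glGenTextures in initializeGL()
    if (pboSupported)
        glDeleteBuffersARB(2, pboIds);    // matches glGenBuffersARB(2, pboIds) in the constructor
}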