Saving scraped data to mysql database through phantomjs and casperjs - web-scraping

Hi, I am working on a web scraper. First I was trying to scrape using PHP cURL, but then I ran into the problem that I wasn't able to scrape sites which load their content through AJAX, so I shifted to PhantomJS and CasperJS.
Now I have successfully installed the webkit and can scrape the data from any website, but I am unable to save the data for long use in a database. Simply, for later use. What I want to do is, whatever data I have scraped I want to save that to mySql database.
Is there any way I can achieve such functionality? I have tried sending Ajax request to send the data to the database but failed.
I came up with another solution: when I scrape the data from the specified website, I push the data to an array called data[] and then I write that data to a .json file, where each batch of data is saved as an array of objects produced by JSON.stringify(data).
Now, I don't know how I can read that file's data and save it in the database. Is it possible, as soon as the scraping is finished, to grab the data from that .json file and save it to the database?
For now just take this code as an example
// Scrape the page title with CasperJS and persist the collected results as JSON.
var casper = require('casper').create();
var file = require('fs');
var data = [];

casper.start('http://casperjs.org/', function() {
    data.push(this.getTitle());
    // Write with mode "w" (overwrite) instead of "a" (append): appending a second
    // serialized array produces `[...][...]`, which is no longer a valid JSON
    // document and would also re-import old rows on every run.
    file.write("file.json", JSON.stringify(data), "w");
});

casper.run();

A Proof Of Concept, using jq :
#!/bin/bash
# Run the scraper, then turn every scraped value into an UPDATE statement for mysql.
casperjs script.js
# Stop when the scraper left no output file, or an empty one.
[[ -s file.json ]] || exit 1
# -r (raw output) is required: without it jq prints each statement as a quoted
# JSON string ("UPDATE ..."), which mysql cannot execute as SQL.
jq -r '"UPDATE ROW SET XXX = "+ .[] + " WHERE FOO=BAR;"' file.json | mysql -D DB_name
The file.json :
[
"foo",
"bar",
"base"
]
jq output :
jq -r '"UPDATE ROW SET XXX = "+ .[] + " WHERE FOO=BAR;"' file.json
UPDATE ROW SET XXX = foo WHERE FOO=BAR;
UPDATE ROW SET XXX = bar WHERE FOO=BAR;
UPDATE ROW SET XXX = base WHERE FOO=BAR;
Check https://stedolan.github.io/jq/

Simple solution I found is to make ajax request to the server, inside the evaluate function :
// Collect the job description sections from the page and POST them to the local
// PHP endpoint from inside the page context (evaluate runs in the scraped page).
casper.then(function() {
    // NOTE: `details` is assigned without `var`, exactly as before, so it stays
    // visible outside this step in case other code reads it.
    details = this.evaluate(function() {
        var elDet = document.getElementsByClassName("job-description-column")[0];
        var details = elDet.children[2].children[0].children;
        var detLen = details.length;
        var linkedData = [];

        // Append one record; title/loc/date come from the first three children
        // of the column's header element.
        var addEntry = function(head, description) {
            var header = elDet.children[0].children[0].children;
            linkedData.push({
                head: head,
                description: description,
                title: header[0].textContent,
                loc: header[1].textContent,
                date: header[2].textContent
            });
        };

        for (var i = 0; i < detLen; i++) {
            if (details[i].nodeName == "H3" && details[i].id != "if-this-sounds-like-you,-apply") {
                // An H3 is paired with the element that follows it. Guard the
                // case where the H3 is the last child, which used to throw.
                var following = details[i + 1];
                addEntry(details[i].textContent, following ? following.textContent : "");
                i++; // skip the element that was consumed as the description
            } else {
                addEntry("No Head", details[i].textContent);
            }
        }

        var s = JSON.stringify(linkedData);
        console.log(linkedData);
        $.ajax({
            method: "POST",
            url: "http://localhost/fiverr/Crawl%20The%20Jobs/modal_scripts.php",
            // The JSON text must be URL-encoded: a raw "&", "+", "=" or "%" in the
            // scraped text would otherwise corrupt the form-encoded body.
            data: "add_jobdets=true&job_details=" + encodeURIComponent(s),
            // Synchronous so the request completes before evaluate() returns.
            async: false
        });
        return linkedData;
    });
});

Related

how to skip blank values in http post body in Jmeter while reading data from csv?

Below is my sample body in JMeter. I tried using the JSR223 PreProcessor with the code below, but it removes the blank values only for the first CSV row. How do I remove all blank values coming from my CSV file? What am I doing wrong here? Any help is appreciated.
def request = new groovy.json.JsonSlurper().parseText(sampler.getArguments().getArgument(0).getValue())
def newRequest = evaluate(request.inspect())
request.each { entry ->
if (entry.getValue().equals('')) {
newRequest.remove(entry.getKey())
}
}
sampler.getArguments().removeAllArguments()
sampler.addNonEncodedArgument('', new groovy.json.JsonBuilder(newRequest).toPrettyString(), '')
sampler.setPostBodyRaw(true)
http body:
{
"number": "${number}",
"marker": "${market}",
"description": "${description}"
}
If you have more than 1 iteration in Thread Group or Loop Controller you will need to revisit your approach to removing and adding back the request body.
Suggested code change:
// Rebuild the sampler's request body without the keys whose value is an empty string.

// Parse the JSON text held in the sampler's first argument.
def parsedBody = new groovy.json.JsonSlurper().parseText(sampler.getArguments().getArgument(0).getValue())

// Work on a copy so the map being iterated is not modified while looping.
// NOTE(review): evaluate(inspect()) compiles the map's printed form as Groovy
// source; the values originate from CSV input, so confirm this is acceptable.
def prunedBody = evaluate(parsedBody.inspect())
for (entry in parsedBody) {
    if (entry.getValue().equals('')) {
        prunedBody.remove(entry.getKey())
    }
}

// Wrap the pruned JSON in a single unnamed HTTP argument.
def prettyJson = new groovy.json.JsonBuilder(prunedBody).toPrettyString()
def bodyArgument = new org.apache.jmeter.protocol.http.util.HTTPArgument('', prettyJson, '', false)
bodyArgument.setAlwaysEncoded(false)

// Install a fresh Arguments container holding only the new body.
def replacementArguments = new org.apache.jmeter.config.Arguments()
replacementArguments.addArgument(bodyArgument)
sampler.setArguments(replacementArguments)
More information:
HTTPSamplerBase source code
Apache Groovy - Why and How You Should Use It

Unable to convert BLOB to String with Cordova sqlite plugin for offline maps

I'm following this tutorial : https://kuamoto.wordpress.com/2016/02/26/myth-1-cant-make-offline-apps/ to get offline maps in an ionic 2 app based on a sqlite database (.mbtiles). I use Cordova Sqlite plugin to query the database as shown in this repo : https://github.com/wilblack/offline-map-example
The database contains a Blob for each combination of x, y and z, which come from the map location I want to display.
I've succeeded to open the database and to query it but I got stuck with the following error :
unknown error (Sqlite code 0): Unable to convert BLOB to string, (OS error - 2:No such file or directory)"
It seems to be a common issue, but I only found solutions for Android directly. Here is my Typescript code:
getTileUrl: function(tilePoint, zoom, tile) {
var z = this._getZoomForUrl();
z = Math.round(z);
var x = tilePoint.x;
var y = tilePoint.y;
y = Math.pow(2, z) - y - 1;
var base64Prefix = 'data:image/gif;base64,';
this.mbTilesDB.transaction((tx) => {
tx.executeSql("SELECT tile_data FROM tiles WHERE zoom_level = ? AND tile_column = ? AND tile_row = ?;", [z, x, y], (tx, res) => {
//Never get here
tile.src = base64Prefix + res.rows.item(0).tile_data;
}, (err, msg) => {
console.log('[MapPage.getTileUrl] error with executeSql', err);
});
}, (err, msg) => {
console.log("[MapPage.getTileUrl] Transaction err:", err);
});
},
_loadTile: function(tile, tilePoint, zoom) {
tile._layer = this;
tile.onload = this._tileOnLoad;
tile.onerror = this._tileOnError;
this.getTileUrl(tilePoint, zoom, tile);
}
The code breaks just after launching the query. If I launch the query inside a Database browser, I obtain a result as a Blob.
The issue is close to this one: How to set tile_data from .mbtiles file as tiles for leaflet? except that I don't even get a result from the query.
Thank you for your help
As far as I know, most JavaScript implementations do not support blobs.
That code assumes that the data in the database is stored as Base64-encoded text (or that the database driver does this conversion automatically).
You have to tell the database to convert the blob into some text format (SELECT hex(tile_data) ...), and then convert that hex string into some useful format in your code.
Cordova-sqlite-ext supports reading BLOBs from pre-populated sqlite databases like .mbtiles-files.
In order to build data-URLs containing tile data you can adapt the following code taken from the README:
```
// Read a BLOB column as Base64 text inside a read-only transaction and log
// the value from the first returned row.
db.readTransaction(function(tx) {
    var query = "SELECT BASE64(data) AS base64_data FROM MyTable";
    var logFirstRow = function(tx, resultSet) {
        var firstRow = resultSet.rows.item(0);
        console.log('BLOB data (base64): ' + firstRow.base64_data);
    };
    tx.executeSql(query, [], logFirstRow);
});
```

Partially update a session variable in meteor?

I think that I might be missing something. I've got the entire contents of a user's product order form stored in a session variable.
// The complete order form state, stored as one object under a single Session key.
// (Properties must be comma-separated; without the commas this is a syntax error.)
var orderFormContents = {
    numDoors: 4,
    numWheels: 4,
    numSeats: 5
};
Session.set("orderFormContentsSessionVar", orderFormContents);
How do I update the value of just one key in orderFormContentsSessionVar, for instance, just numDoors?
I don't want to overwrite the entire existing contents of the session var.
I would love to be able to do something like:
Session.set("orderFormContentsSessionVar.numDoors", 2);
Something equivalent to _.extend
UPDATE
Following the example of the answer below, I just wrote a function for it:
/**
 * Merges `updateParams` into the object stored in a Session variable and
 * writes the merged object back, leaving the other keys untouched.
 *
 * @param {string} sessionVarName - name of the Session variable to update
 * @param {Object} updateParams - keys/values to set on the stored object
 */
var updateSession = function(sessionVarName, updateParams){
    // Merge into a fresh object: this also works when the Session variable has
    // not been set yet (Session.get returns undefined), and never mutates the
    // value handed back by Session.get.
    var obj = _.extend({}, Session.get(sessionVarName), updateParams);
    Session.set(sessionVarName, obj);
    console.log("updated session name: ", sessionVarName, "new session contents: ", Session.get(sessionVarName));
};
Using like you proposed:
// Read the stored order form, override numDoors, and write it back.
// The key passed to Session.get must match the one passed to Session.set
// ("orderFormContentsSessionVar"); the misspelled key read a different, unset variable.
var obj = Session.get("orderFormContentsSessionVar");
Session.set("orderFormContentsSessionVar", _.extend(obj, {numDoors: 2}));
_.extend should work just fine.
// Fetch the current order form, overlay the changed key, and store the result.
var currentOrderForm = Session.get("orderFormContentsSessionVar");
var updatedOrderForm = _.extend(currentOrderForm, { numSeats: 10 });
Session.set("orderFormContentsSessionVar", updatedOrderForm);
I doubt there is any other way.

Why should I ensure a unique timestamp in Meteor?

I'm learning meteor and reading the todos example:
$ curl https://install.meteor.com | sh
$ meteor create --example todos
In todos/server/bootstrap.js around line 50 there are lines as follows.
// Seed the database: one Lists document per entry in `data`, and one Todos
// document per item in that entry's `contents`. Each item is [text, ...tags].
var timestamp = (new Date()).getTime();
data.forEach(function (listSpec) {
    var list_id = Lists.insert({name: listSpec.name});
    listSpec.contents.forEach(function (info) {
        Todos.insert({
            list_id: list_id,
            text: info[0],
            timestamp: timestamp,
            tags: info.slice(1)
        });
        // Bump by one so every inserted todo gets a distinct, increasing timestamp.
        timestamp += 1;
    });
});
I wonder why do we need to ensure the timestamp is unique? Is it required for meteor/mongodb or just application-specific?
The timestamp isn't required and doesn't have to be unique; it's application-specific in this case.
The use case here is just so that each todo item has an sequential timestamp so that it displays it in the correct order as can be seen in the client js, oldest at the top and newest at the bottom.
Template.todos.todos = function () {
...
return Todos.find(sel, {sort: {timestamp: 1}});
};
The server JS adds 1 millisecond to each item in the loop to simulate the sort order that would result from an actual user entering the items one after another. It's only done once, to create an example database.
If there was no sort order/sequential timestamps the todos would be sorted by the records _id, which would make their order at best, random. The reason for this is meteor uses Meteor.uuid() to generate its _id fields which are random & without sequentiality.

How to check for the existence of an IIS 7 web site via WiX 3.5?

Note: This question can also be found on the WiX mailing list.
I need to be able to check for the existence of an IIS7 website based on the website's description. If the website does not exist I need to cancel the installation. If the website exists I want to continue the installation. I also need to be able to save the site id of the website so that I may use it during an uninstall.
For debugging purposes I have hard coded the website's description. I do not see any indication that a check for the website is being made within the MSI log file. This is the code I am using:
<!-- Web site element with Id IISWEBSITE, identified by the description
     "Default Web Site" (hard-coded for debugging) and SiteId="*".
     NOTE(review): presumably meant to locate an existing site rather than
     create one - confirm against the WiX IIS extension documentation. -->
<iis:WebSite Id="IISWEBSITE" Description="Default Web Site" SiteId="*">
<!-- Address entry for the site; only the port (1) is specified. -->
<iis:WebAddress Id="IisWebAddress" Port="1"/>
</iis:WebSite>
<!-- Condition whose expression is just IISWEBSITE: the message is shown when
     that value evaluates to false (e.g. it was never set). -->
<Condition Message="Website [IISWEBSITE] not found.">
<![CDATA[IISWEBSITE]]>
</Condition>
Using ORCA I can see that IIsWebAddress and IIsWebSite tables are added to the MSI. The values are:
IIsWebsite
WEB: IISWEBSITE
Description: Default Web Site
KeyAddress: IisWebAddress
Id: -1
IIsWebAddress
Address: IisWebAddress
Web_: IISWEBSITE
Port: 1
Secure: 0
With the above code, the installation is halted with the error message "Website not found". It appears that IISWEBSITE is never getting set. Though, I know that "Default Web Site" exists. I know that I must be missing something, but what?
How can I perform a simple check for the existence of a website in IIS 7?
I too had same problem.
I wrote a custom action to check the version of IIS from registry.
On the basis of registry value create virtual directory
I wrote a custom action in Javascript to do this. If you are assuming IIS7, then you can use the appcmd.exe tool, and just invoke it from within Javascript to get the list of sites. In theory, it's pretty simple to do. But in practice, there's a bunch of hoops you need to jump through.
Here's what I came up with:
// RunAppCmd()
//
// Runs appcmd.exe (from <windows>\system32\inetsrv) with the given command
// string, redirecting its output to a temp file via cmd.exe.
//
//   command      - arguments passed to appcmd.exe, e.g. "list sites"
//   deleteOutput - when truthy, the temp output file is deleted before returning
//
// Returns { rc, outputfile }: rc is the value returned by shell.Run, and
// outputfile is the temp file path (null when deleteOutput was requested).
//
function RunAppCmd(command, deleteOutput) {
    var shell = new ActiveXObject("WScript.Shell"),
        fso = new ActiveXObject("Scripting.FileSystemObject"),
        tmpdir = fso.GetSpecialFolder(SpecialFolders.TemporaryFolder),
        tmpFileName = fso.BuildPath(tmpdir, fso.GetTempName()),
        windir = fso.GetSpecialFolder(SpecialFolders.WindowsFolder),
        appcmd = fso.BuildPath(windir,"system32\\inetsrv\\appcmd.exe") + " " + command,
        rc;
    deleteOutput = deleteOutput || false;
    LogMessage("shell.Run("+appcmd+")");
    // use cmd.exe to redirect the output.
    // The redirect target is quoted because the temp folder can contain spaces
    // (e.g. a user profile path); unquoted, the redirection would target the
    // wrong file. The appcmd path itself is left unquoted so the command does
    // not start with a quote, which would change how cmd.exe /c parses it.
    rc = shell.Run("%comspec% /c " + appcmd + "> \"" + tmpFileName + "\"", WindowStyle.Hidden, true);
    LogMessage("shell.Run rc = " + rc);
    if (deleteOutput) {
        fso.DeleteFile(tmpFileName);
    }
    return {
        rc : rc,
        outputfile : (deleteOutput) ? null : tmpFileName
    };
}
// GetWebSites_Appcmd()
//
// Gets website info using Appcmd.exe, only on IIS7+ .
//
// Runs "appcmd list sites" through RunAppCmd, parses each output line, and
// returns an array of JS objects, one per site:
//   { id, name ("W3SVC/<id>"), description, bindings, state }
// where bindings maps a protocol name to the fields parsed from that binding.
//
// Throws when appcmd.exe returns a nonzero exit code. The temp output file is
// removed, and the text stream closed, even when parsing fails.
//
function GetWebSites_Appcmd() {
    var r, fso, textStream, sites, oneLine, record,
        ParseOneLine = function(oneLine) {
            // split the string: capture quoted strings, or a string surrounded
            // by parens, or lastly, tokens separated by spaces,
            var tokens = oneLine.match(/"[^"]+"|\(.+\)|[^ ]+/g),
                // split the 3rd string: it is a set of properties separated by colons
                props = tokens[2].slice(1,-1),
                t2 = props.match(/\w+:.+?(?=,\w+:|$)/g),
                bindingsString = t2[1],
                ix1 = bindingsString.indexOf(':'),
                t3 = bindingsString.substring(ix1+1).split(','),
                L1 = t3.length,
                bindings = {}, i, split, obj, p2;
            for (i=0; i<L1; i++) {
                split = t3[i].split('/');
                if (split.length < 2) {
                    // not of the form protocol/info (e.g. an empty entry):
                    // nothing to parse, and split[1] would be undefined.
                    continue;
                }
                obj = {};
                p2 = split[1].split(':');
                if (split[0] == "net.tcp") {
                    obj.port = p2[0];
                }
                else if (split[0] == "net.pipe") {
                    obj.other = p2[0];
                }
                else if (split[0] == "http") {
                    obj.ip = p2[0];
                    if (p2[1]) {
                        obj.port = p2[1];
                    }
                    obj.hostname = "";
                }
                else {
                    obj.hostname = p2[0];
                    if (p2[1]) {
                        obj.port = p2[1];
                    }
                }
                bindings[split[0]] = obj;
            }
            // return the object describing the website
            return {
                id          : t2[0].split(':')[1],
                name        : "W3SVC/" + t2[0].split(':')[1],
                description : tokens[1].slice(1,-1),
                bindings    : bindings,
                state       : t2[2].split(':')[1] // started or not
            };
        };

    LogMessage("GetWebSites_Appcmd() ENTER");
    r = RunAppCmd("list sites");
    fso = new ActiveXObject("Scripting.FileSystemObject");
    if (r.rc !== 0) {
        // do not leave the temp output file behind on failure
        if (r.outputfile && fso.FileExists(r.outputfile)) {
            fso.DeleteFile(r.outputfile);
        }
        // 0x80004005 == E_FAIL
        throw new Exception("ApplicationException", "exec appcmd.exe returned nonzero rc ("+r.rc+")", 0x80004005);
    }
    textStream = fso.OpenTextFile(r.outputfile, OpenMode.ForReading);
    sites = [];
    try {
        // Read from the file and parse the results.
        while (!textStream.AtEndOfStream) {
            oneLine = textStream.ReadLine();
            if (!oneLine || /^\s*$/.test(oneLine)) {
                // blank line: nothing to parse
                continue;
            }
            record = ParseOneLine(oneLine);
            LogMessage("  site: " + record.name);
            sites.push(record);
        }
    }
    finally {
        // always release the stream and remove the temp file, even if a line
        // could not be parsed
        textStream.Close();
        fso.DeleteFile(r.outputfile);
    }
    LogMessage("GetWebSites_Appcmd() EXIT");
    return sites;
}

Resources