We need to scrape VEEC Website for the total number once a week.
As an example, for the week of 17/10/2016 - 23/10/2016 the URL returns the number Total 167,356 when the search button is clicked. We want this number to be stored in our database.
I'm using coldfusion to generate the weekly dates as params and have been passing them like the above URL. But I'm unable to find a query param so that the "Search" button click event is triggered.
I've tried like this & this but nothing seems to be working.
Any pointers?
It seems like for every form submission, a CSRF token is added, which prevents malicious activity. To make matters worse for you, the CSRF token is changed for each form submission, not just for each user, which makes it virtually impossible to circumvent.
When I make a CFHTTP POST request to this form, I get HTML FileContent back, but there is no DB data within the results table cell placeholders. It seems to me that the form owner allows form submission from an HTTP request, but if the CSRF token cannot be validated, no DB data is returned.
It may be worth asking the website owner if there is any kind of REST API that you can hook into...
If you want to use a headless browser PhantomJS (https://en.wikipedia.org/wiki/PhantomJS) for this, here is a script that will save the total to a text file.
At command prompt, after you install PhantomJS, run phantomjs.exe main.js.
main.js
"use strict";
/*
 * PhantomJS script: opens the register search page for a fixed date range,
 * dispatches a click on the search control, and after the page reloads writes
 * the text of one results-grid cell to veet.txt and a screenshot to veet.png.
 *
 * Run with: phantomjs.exe main.js
 */

// True until the initial page load has been handled; the load triggered by
// the click is handled by the second branch of onLoadFinished.
var firstLoad = true;
var url = 'https://www.veet.vic.gov.au/Public/PublicRegister/Search.aspx?CreatedFrom=17%2F10%2F2016&CreatedTo=23%2F10%2F2016';
var page = require("webpage").create();

page.viewportSize = {
    width: 1280,
    height: 800
};

// Receives whatever the page passes to window.callPhantom() and saves it.
page.onCallback = function (result) {
    var fs = require('fs');
    fs.write('veet.txt', result, 'w');
};

page.onLoadStarted = function () {
    console.log("page.onLoadStarted, firstLoad", firstLoad);
};

page.onLoadFinished = function (status) {
    console.log("page.onLoadFinished, firstLoad", firstLoad);

    // The only normal exit is in the second-load branch below, so a failed
    // load has to terminate the script explicitly or it never exits.
    if (status !== 'success') {
        console.log("page load failed, status", status);
        phantom.exit(1);
        return;
    }

    if (firstLoad) {
        firstLoad = false;
        // Presumably the 4th ".dx-vam" element is the Search button - the
        // index is tied to the current page markup; verify if the site changes.
        var clicked = page.evaluate(function () {
            var button = document.querySelectorAll(".dx-vam")[3];
            if (!button) {
                return false;
            }
            var event = document.createEvent("MouseEvents");
            event.initEvent("click", true, true);
            button.dispatchEvent(event);
            return true;
        });
        if (!clicked) {
            // No click means no second load, so nothing else would ever exit.
            console.log("search control not found, nothing to click");
            phantom.exit(1);
        }
    } else {
        // Presumably cell 130 of ".dxgv" holds the total - the index is tied
        // to the current grid layout; verify if the site changes.
        var found = page.evaluate(function () {
            var element = document.querySelectorAll('.dxgv')[130];
            if (!element) {
                return false;
            }
            window.callPhantom(element.textContent);
            return true;
        });
        if (!found) {
            console.log("total cell not found, veet.txt was not written");
        }
        // Give the page a moment to settle before taking the screenshot.
        setTimeout(function () {
            page.render('veet.png');
            phantom.exit();
        }, 3000);
    }
};

page.open(url);
The script is not perfect, you can work on it if you're interested, but as is it will save the total to a file veet.txt and also save a screenshot veet.png.
Related
Is there a way to call an external API Endpoint on Google Forms every time the form is filled out?
First:
you'll need to set up your App script project and you'll do that by:
Visit script.google.com to open the script editor. (You'll need to be signed in to your Google account.) If this is the first time you've been to script.google.com, you'll be redirected to a page that introduces Apps Script. Click Start Scripting to proceed to the script editor.
A welcome screen will ask what kind of script you want to create. Click Blank Project or Close.
Delete any code in the script editor and paste in the code below.
This video and the doc will help
Second
you'll need to create an installable trigger, you can add it to the form directly or to the spreadsheet that has the responses
/**
 * Creates an installable trigger that runs the function named
 * 'sendPostRequest' whenever the form identified by 'formkey' is submitted.
 * The form key is the id found in the form's URL.
 */
function setUpTrigger() {
  var handlerName = 'sendPostRequest'; // function that performs the POST request
  var formKey = 'formkey'; // taken from the form's URL
  var builder = ScriptApp.newTrigger(handlerName);
  var formBuilder = builder.forForm(formKey);
  var submitBuilder = formBuilder.onFormSubmit();
  submitBuilder.create();
}
Check the doc
Third
create the sendPostRequest function and add the UrlFetchApp to it
/**
 * Sends a sample form-data POST request to https://httpbin.org/post.
 *
 * @param {Object} e Event object supplied by the trigger; this sample does
 *     not read it and always posts the same hard-coded data.
 */
function sendPostRequest(e) {
  // Make a POST request with form data.
  var resumeBlob = Utilities.newBlob('Hire me!', 'text/plain', 'resume.txt');
  var formData = {
    'name': 'Bob Smith',
    'email': 'bob@example.com',
    'resume': resumeBlob
  };
  // Because payload is a JavaScript object, it is interpreted
  // as form data. (No need to specify contentType; it automatically
  // defaults to either 'application/x-www-form-urlencoded'
  // or 'multipart/form-data')
  var options = {
    'method' : 'post',
    'payload' : formData
  };
  UrlFetchApp.fetch('https://httpbin.org/post', options);
}
Check the doc
Try something like this in your app script:
// Webhook endpoint that receives the submitted answers as JSON.
var POST_URL = "enter your webhook URL";

/**
 * Form-submit handler: posts the submitted answers to POST_URL as a JSON
 * object keyed by question title.
 *
 * @param {Object} [e] Form-submit event object. When it carries a `response`
 *     that response is used, so the payload always belongs to the submission
 *     that fired the trigger. Without it (e.g. a manual run) the most recent
 *     stored response of the active form is used instead.
 */
function onSubmit(e) {
  var submitted = e && e.response;
  if (!submitted) {
    var allResponses = FormApp.getActiveForm().getResponses();
    // Nothing has been submitted yet, so there is nothing to forward.
    if (allResponses.length === 0) {
      return;
    }
    submitted = allResponses[allResponses.length - 1];
  }

  var itemResponses = submitted.getItemResponses();
  var payload = {};
  for (var i = 0; i < itemResponses.length; i++) {
    var question = itemResponses[i].getItem().getTitle();
    var answer = itemResponses[i].getResponse();
    payload[question] = answer;
  }

  var options = {
    "method": "post",
    "contentType": "application/json",
    "payload": JSON.stringify(payload)
  };
  UrlFetchApp.fetch(POST_URL, options);
}
Be sure to replace the POST_URL variable with your webhook, you can use requestcatcher.com to test this out.
Add a trigger to the script by clicking "Triggers" in the side menu
Open the menu (top-right dots)
Click in Script Editor
Paste the above code (changing the POST_URL)
Click in the clock icon (left-side menu), which means Triggers.
On the right-bottom corner, click in the blue Add trigger button (a form will show as the image below).
It should show onSubmit under Choose which function to run.
Make sure Select event type is set as On form submit.
Click Save button.
After that, submit your form and watch for the request to come in.
This is pretty straightforward with Google Scripts.
Just create a new project bound to your spreadsheet and create 2 elements:
A function that will contain all relevant data to make the call (see docs for making a HTTP request from Google Apps Script)
A trigger linked to the spreadsheet. You can set it to run each time an edit occurs or form is submitted
Voilà, your sheet will call whatever endpoint you wish on submission. You can even parse the spreadsheet to return that data to your endpoint
I have angular-meteor app that needs Material md-autocomplete from a collection with 53,296 documents with angularUtils.directives.dirPagination but this amount of data make my browser hang.
I'm publishing the collection with:
// Publishes the 'city' record set: City documents whose city_name matches
// searchString. `options` is handed to City.find() unchanged.
Meteor.publish('city', function (options, searchString) {
// Case-insensitive ('i') match on city_name; when searchString is missing
// the pattern becomes '.*.*'.
var where = {
'city_name': {
'$regex': '.*' + (searchString || '') + '.*' ,
'$options': 'i'
}
};
return City.find(where, options);
});
I subscribe with:
subscriptions: function () {
Meteor.subscribe('city');
this.register('city', Meteor.subscribe('city'));
}
and have pagination on controller :
$scope.currentPage = 1;
$scope.pageSize = 100;
$scope.sort = {city_name_sort : 1};
$scope.orderProperty = '1';
$scope.helpers({
city: function(){
return City.find({});
}
});
but it takes a long time to load and it makes Chrome stop working.
You already have most of the server-side searching done because your search is running inside a subscription. You should make sure that the city_name field is indexed in mongo! You should only return that field to minimize data transfer. You can also simplify your regex.
/**
 * Publishes the cities whose `city_name` contains `searchString`
 * (case-insensitive), sorted by name and limited to the `city_name` field.
 *
 * @param {string} searchString Text typed by the user. It is matched
 *     literally; anything that is not a string is treated as empty.
 */
Meteor.publish('city', function (searchString) {
  // The term comes from the client, so never trust its type.
  const term = typeof searchString === 'string' ? searchString : '';
  // Escape regex metacharacters: input such as "(" would otherwise make
  // `new RegExp` throw, and crafted patterns could run arbitrary regexes.
  const escaped = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const re = new RegExp(escaped, 'i');
  const where = { city_name: { $regex: re }};
  return City.find(where, {sort: {city_name: 1}, fields: {city_name: 1}});
});
What I've found helps with server-side auto-complete is:
Don't start searching until the user has typed 3 or 4 characters. This drastically narrows down the search results.
Throttle the search to only run every 500ms so that you're not sending every character to the server because then it has to keep re-executing the search. If the person is typing fast the search might only run every 2 or 3 characters.
Run the same .find() on the client that you're running on the server (instead of just querying for {}). That's just good practice since the client-side collection is the union of all subscriptions on that collection, there might be documents there that you don't want to list.
Lastly I don't know why you're subscribing twice here:
subscriptions: function () {
Meteor.subscribe('city');
this.register('city', Meteor.subscribe('city'));
}
only one of those Meteor.subscribe('city') calls is necessary.
I have global array that works just fine and stores the URL's of the chosen images from the user after i click submit in the form.
the problem is when i want to submit another form, the global array will still have the URL's of the previous submission.
what i want to do is to create an array for every user to store his URL's; once he clicks submit, the array will be dropped or deleted. so if there were multiple users using the same function, every one of them will have his own array to store his URL's
How do i do this?
this is what i have tried but when i click on submit on the form page, nothing happens
first, this is the method that returns the url of the chosen image by the user, the method exists in both folder (both/file.js)
storeUrlInDatabaseSS: function( url ) {
check( url, String );
Modules.both.checkUrlValidity( url );
try {
return url;
} catch( exception ) {
return exception;
}
}
then i created the session variables in the client side (client/file.js)
Session.set("screenshots", []);
Session.set("i", 0);
var screenshots = Session.get("screenshots");
var i = Session.get("i");
and here i store the url in the array
let _addUrlToDatabaseSS = ( url ) => {
screenshots[i++] = url;
Session.set("screenshots", screenshots);
};
and am using Meteor Collection Hook Package
and i added these two lines of code which should be executed after the user presses submit, they exist inside "client/files.js" directory
Products.before.insert(function (userId, doc) {
doc.screenShots = Session.get("screenshots");
});
now whenever i click submit nothing happens, i think the problem is because nothing is actually stored inside the screenShots attribute in the collection here
screenShots: {
type: [String]
},
when i set the screenShots attribute to an empty array by default like the code below, the submit button works
screenShots: {
type: [String],
autoValue: function() {
return [];
}
},
I tried to use the other way of using AutoForm.hooks
AutoForm.hooks({
submitPostForm: {
before: {
insert: function(doc) {
doc.$set.screenShots = Session.get("screenshots");
}
}
}
});
the is my form in the .html file
{{> quickForm collection="Products" id="submitPostForm"
type="method" meteormethod="submitPost" omitFields="createdAt, previewImage, screenShots, sourceCode, userID"}}
and this is the method triggered once the user submit the form, it exist in the server side.
submitPost: function (app) {
// Console.log('new App:', app);
check(app, {
title: String,
description: String,
category: String,
price: Number
});
Products.insert(app);
}
for some reason my before hook isn't working and i can't see why!
what am i doing wrong here?
One of the ways to create a global array per user is to use Session. This way it is also possible to persist the data across the app (only client-side).
Simple way to use Session is thus:
Create an array in Session called url_list:
Session.set("url_list", []);
Retrieve the array from Session:
var url_list = Session.get("url_list");
Make changes to url_list:
url_list.push(someData);
Store url_list in the Session again:
Session.set("url_list", url_list);
Note: Session can only be used on client-side and all related code should be on the client-side.
More about Session.
PERSISTING DATA TO SERVER-SIDE:
The best way to persist the url_list to the server, would be to insert a new document into the database collection containing the Session data.
// Saves the URL list held in Session to the Products collection and then
// resets the list.
// NOTE(review): assigned without var/let/const, so this creates a global -
// presumably intentional so other files can call it; confirm.
insertToDB = function() {
// Read the list stored in Session under 'url_list'.
var url_list = Session.get('url_list');
// Insert a new document whose url_list field holds that list.
Products.insert({
'url_list': url_list
});
Session.set('url_list', []); // To empty the client-side list
}
There are sites whose DOM and contents are generated dynamically when the page loads. (Angularjs-based sites are notorious for this)
What approach do you use?
I tried both phantomjs and jsdom but it seems I am unable get the page to execute its javascript before I scrape.
Here's a simple jsdom example (not angularjs-based but still dynamically generated)
var env = require('jsdom').env;
exports.scrape = function(link, callback) {
var config = {
url: link,
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36'
},
done: jsdomDone
};
env(config);
}
function jsdomDone(err, window) {
var info = null;
if(err) {
console.error(err);
} else {
var $ = require('jquery')(window);
console.log($('.profilePic').attr('src'));
}
}
exports.scrape('https://www.facebook.com/elcompanies');
I tried phantomjs with moderate success.
var page = new WebPage()
var fs = require('fs');
page.onLoadFinished = function() {
console.log("page load finished");
window.setTimeout(function() {
page.render('export.png');
fs.write('1.html', page.content, 'w');
phantom.exit();
}, 10000);
};
page.open("https://www.facebook.com/elcompanies", function() {
page.evaluate(function() {
});
});
Here I wait for the onLoadFinished event and even put a 10-second timer. The interesting thing is that while my export.png image capture of the page shows a fully rendered page, my 1.html doesn't show the .profilePic class element in its rightful place. It seems to be sitting in some javascript code, surrounded by some kind of "require("TimeSlice").guard(function() {bigPipe.onPageletArrive({..." block
If you can provide me a working example that scrapes the image off this page, that'd be helpful.
I've done some scraping in Facebook by using nightmarejs.
Here is a code that I did to get some content from some posts of a Facebook page.
module.exports = function checkFacebook(callback) {
var nightmare = Nightmare();
Promise.resolve(nightmare
.viewport(1000, 1000)
.goto('https://www.facebook.com/login/')
.wait(2000)
.evaluate(function(){
document.querySelector('input[id="email"]').value = facebookEmail
document.querySelector('input[id="pass"]').value = facebookPwd
return true
})
.click('#loginbutton input')
.wait(1000)
.goto('https://www.facebook.com/groups/bierconomia')
.evaluate(function(){
var posts = document.getElementsByClassName('_1dwg')
var length = posts.length
var postsContent = []
for(var i = 0; i < length; i++){
var pTag = posts[i].getElementsByTagName('p')
postsContent.push({
content: pTag[0] ? pTag[0].innerText : '',
productLink: posts[i].querySelector('a[rel = "nofollow"]') ? posts[i].querySelector('a[rel = "nofollow"]').href : '',
photo: posts[i].getElementsByClassName('_46-i img')[0] ? posts[i].getElementsByClassName('_46-i img')[0].src : ''
})
}
return postsContent
}))
.then(function(results){
log(results)
return new Promise(function(resolve, reject) {
var leanLinks = results.map(function(result){
return {
post: {
content: result.content,
productLink: extractLinkFromFb(result.productLink),
photo: result.photo
}
}
})
resolve(leanLinks)
})
})
The thing that I find useful with nightmare is that you can use the wait function to either wait for X ms or for a specific class to render.
This is because generated web pages based on AJAX calls have asynchronous AJAX calls and you can't rely on onLoad events (because data still not available).
In my personal opinion, the most reliable way would be tracing which REST services are being called from this HTML and make direct calls to them. Sometimes you will need using values found in HTML or values taken from another calls.
I know this may sound complicated, and in fact it is. You kinda need to debug page and learn what is being called. But this will work for sure.
By the way, using chrome developer tools will help this task. Just observe which call are made in network tab. You can even observe what has been sent and received in each AJAX call.
If it is a one time thing, that is, if I just want to scrape a single page once, I just use the browser and artoo-js.
I never tried to write a page on disk using phantom, but I have two observations:
1) you are using fs.write to write things to disk, but writeFile is an async call. This means that you either need to change it to fs.writeFileSync or use a callback before closing phantom.
2) I hope you aren't expecting to write a HTML to a file and open it in a browser and get it rendered like when you saved a png, because it doesnt work this way. Some objects can be stored directly in DOM properties and certainly there are values stored in javascript variables, those things will never be persisted.
i have been knocking my head for 2 days now in that .
am creating a search engine, am creating queries dynamically using the Meteor framework; the queries are working fine and when i search i can rebind the UI (a table in my case) with the dynamic query output.
however if an insert/update/delete operation occurs the data object
and the UI (html Table) is not updating.
which means that the template is not re-rendered when the data object changes.
Template.search.rendered = function () {
Meteor.autorun(function() {
alarmsData = Alarms.find(getSearchSelector($('#searchTxt').val(), $('#startTimeTxt').val(), $('#endTimeTxt').val())).fetch()
console.log("rendered")
//alarmsData = Alarms.find({},{sort: {timestamp: "desc"} }).fetch();
searchControls(alarmsData)
getConsole(alarmsData, ".console")
$('#badge').html(alarmsData.length)
})
}
the getConsole function is just reading the array from the search and creating an html table (this is working fine)
as for the beginning, i am creating a simple query as the default for my search, and then am changing this query whenever the user changes the search criteria. i can notice that only the first instance of the data object is kept and tracked for changes, so if the second search criteria resides within the first one, it's updating the UI; if not, nothing happens
i have used the Meteor.autorun(function(){}) function, however i traced its execution with console.log and i can see it's not executing when i insert data in the database for the same collection.
One, I believe you are trying to use Deps.autorun. Also, there is nothing in your autorun that seems to be dependent on a reactive source. Since alarmsData is taking a snapshot of data it won't care when Alarms has data changing.
Second, I would probably approach this with a redirect. I would compile my data, and redirect to the same page, allowing the server to handle the querying for me. This easily allows you to jump to this page from anywhere else with a prefilled query in the parameters (because the route would then handle it) and also gives a visual change to the navigation bar when a search has happened (just like every other search engine). You would do something like this on a button click:
// Collect the search criteria, URL-encoded, as the query parameters of the
// redirect.
var query = {};
var path;

// Each assignment is its own statement: chaining them with the comma
// operator works by accident and breaks as soon as a line is moved.
query.text = encodeURIComponent($('#searchTxt').val());
query.start = encodeURIComponent($('#startTimeTxt').val());
query.end = encodeURIComponent($('#endTimeTxt').val());

// redirect to current path
path = Router.routes[Router.current().route.name].path({}, {
  query: query
});
Router.go( path );
In your router you would just pass the query into your server and route as a data object (assuming you are using iron-router):
this.route( "search", {
path: "/search",
waitOn: function() {
return [
Meteor.subscribe( "searchAlarms", _.omit( this.params, "hash" ) ),
]
},
data: function () {
return { "query": _.omit( this.params, "hash" ) };
}
});
This will not only give you the query data that was used for the search (in your template) but the server can now handle the search for you! Your Alarms data now holds all of the documents needed to display to the user and you no longer need to subscribe to all your Alarms. This is also great because it is automatically reactive. So if a new Alarm matches your query filter it will automatically be passed down to the client and displayed to the user without needing to setup any extra dependencies/autoruns.
Note though, that if you are subscribing to Alarms elsewhere you will still need to do filtering client-side.
What a strange meteor code…
The "rendered" code method code is called once you will be rendering the search template
getSearchSelector($('#searchTxt').val(), ...) is not reactive; my advice is to store your search criteria in Session variables and read those same Session variables to build the find parameters.
Are you looking for displaying all the alarms Data ?
/**
 * Builds a cursor over the alarms matching the search criteria currently
 * stored in Session under "text", "start" and "end".
 *
 * @returns the cursor returned by Alarms.find() for the computed selector.
 */
function getAlarms()
{
  var text = Session.get("text");
  var from = Session.get("start");
  var to = Session.get("end");
  var filter = getSearchSelector(text, from, to);
  return Alarms.find(filter);
}

// Template helper: the alarms matching the current criteria.
Template.search.alarms = function () {
  return getAlarms();
}

// Template helper: how many alarms match the current criteria.
Template.search.alarmsCount = function () {
  return getAlarms().count();
}

// Copy each search field into Session as the user edits it.
// 'input' is used instead of 'keypress' because keypress fires before the
// field's value changes (and not at all for paste or deletion), which would
// leave Session one edit behind. The value is read from the element that
// fired the event rather than through a page-wide selector.
Template.search.events({
  'input input[name=text]' : function(e,o)
  {
    Session.set("text", e.currentTarget.value);
  },
  'input input[name=start]' : function(e,o)
  {
    Session.set("start", e.currentTarget.value);
  },
  'input input[name=end]' : function(e,o)
  {
    Session.set("end", e.currentTarget.value);
  }
});
// And your template will look something like:
<template name="search">
  <!-- Search form; the inputs are matched by name in Template.search.events -->
  Search alarms
  <input type="text" name="text" placeholder="Enter your text here…"/>
  <input type="text" name="start" placeholder="start time"/>
  <input type="text" name="end" placeholder="end time"/>
  There is {{alarmsCount}} alarms(s);
  {{#each alarms}}
    Alarm object: {{.}}
  {{/each}}
</template>
I guess it's solved by using Session.set & get, and automatically subscribing to the server and sending the dynamic query.
Check the below Code
Template.zConsole.rendered = function () {
Session.set("obj", getSearchSelector($('#searchTxt').val(), $('#startTimeTxt').val(), $('#endTimeTxt').val()))
Deps.autorun(function (){
Meteor.subscribe("dynamicAlarms", Session.get("obj"))
console.log("Count from AutoRun ==> " + Alarms.find(Session.get("obj")).count())
})
}
on the server
Meteor.publish('dynamicAlarms',function (searchObj) {
return Alarms.find(searchObj)
})
& it works perfect with less code.