Puppeteer / Node - Target.createTarget - Target Closed - web-scraping

I'm using Node/Puppeteer in the code below, passing in a large list of URL's for traversal and scraping. It has been difficult to do it asynchronously, though I find that I am getting closer and closer to the answer. I am currently stuck on an issue related to the following error.
UnhandledPromiseRejectionWarning: Unhandled promise rejection (rejection id: 17): Error: Protocol error (Target.createTarget): Target closed.
This error occurs once upon every iteration of the while loop. Though I'm not sure what I may be doing incorrectly.
Could someone help me do the following:
1) Diagnose the source of the error.
2) Potentially find a more effective way to traverse a large list of URLs asynchronously.
async function subProc(list, batchSize) {
let subList = null;
let i = 0;
while (list.length > 0) {
let browser = await puppeteer.launch();
subList = list.splice(0, batchSize);
console.log("Master List Size :: " + list.length);
console.log("SubList Size :: " + subList.length);
for (let j = 0; j < subList.length; j++) {
promiseArray.push(new Promise((resolve, reject) => {
resolve(pageScrape(subList[j], browser));
}));
}
Promise.all(promiseArray)
.then(response => {
procArray.concat(response);
});
promiseArray = new Array();
try {
await browser.close();
} catch(ex){
console.log(ex);
}
};
}
async function pageScrape(url, browser) {
let page = await browser.newPage();
await page.goto(url, {
timeout: 0
});
await page.waitFor(1000);
return await page.evaluate(() => {
let appTitle = document.querySelector('').innerText;
let companyName = document.querySelector('').innerText;
let dateListed = document.evaluate("", document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue.innerText;
let category = document.evaluate("']//a//strong", document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue.innerText;
/* */
return {
appTitle,
companyName,
dateListed,
category
}
}).then(response => {
let urlData = {
id: subList[j],
appName: response.appTitle,
companyName: response.companyName,
dateListed: response.dateListed,
category: response.category
}
return urlData;
});
};

I figured out the solution to the problem I was having.
Every computer is limited in its processing ability, so instead of iterating through 1000 urls simultaneously you have to break it down into smaller pieces.
By using a PromiseAll, and iterating and scraping 10 urls at a time and storing these values in an array, I was able to throttle the processing required to iterate through all 1000 urls.
processBatch(subData, 10, procArray).then((processed)=>{
for(let i = 0; i < procArray.length; i++){
for(let j = 0; j < procArray[i].length; j++){
results.push(procArray[i][j]);
}
}
function processBatch(masterList, batchSize, procArray){
return Promise.all(masterList.splice(0, batchSize).map(async url =>
{
return singleScrape(url)
})).then((results) => {
if (masterList.length < batchSize) {
console.log('done');
procArray.push(results);
return procArray;
} else {
console.log('MasterList Size :: ' + masterList.length);
procArray.push(results);
return processBatch(masterList, batchSize, procArray);
}
})
}

Related

Quickly finding users by phone number with Firebase backend

I’m working on an app with a Firebase backend. During sign up I would like to let new users see which of their contacts are already on the app to add them as friends. So basically, use phone numbers to match users with contacts.
I am having a big performance headache when querying the database to find users.
Since Firestore does not support OR queries, I run two queries per phone number (one to check national format, the other for international format), and if any returns a document, set that document as the found user:
findUserByPhoneNumber = (number, callback) => {
//utility function to, well, sanitize phone numbers
sanitizeNumber = (str) => {
if (str) {
var num = str.match(/\d/g);
num = num.join("");
return num;
} else {
return null
}
}
var foundUser = null
Promise.all([
usersRef.where('phoneNumbers.nationalFormat', '==', sanitizeNumber(number)).get()
.then(snapshot => {
if (snapshot.docs.length > 0 && snapshot.docs[0].data()) {
// console.log('nationalFormat result: ', snapshot.docs[0]);
foundUser = snapshot.docs[0].data()
}
return foundUser
}),
usersRef.where('phoneNumbers.internationalFormat', '==', sanitizeNumber(number)).get()
.then(snapshot => {
if (snapshot.docs.length > 0 && snapshot.docs[0].data()) {
// console.log('internationalFormat result: ', snapshot.docs[0]);
foundUser = snapshot.docs[0].data()
}
return foundUser
})
])
.then(results => {
res = results.filter(el => { return el != null })
if (results.length > 0) {
callback(res[0])
}
})
}
findUserByPhoneNumber runs for each contact in a loop. When testing on my phone with 205 contacts, the whole process takes about 30 seconds, which is about 29 seconds longer than I would like, especially given the test database has only 8 records...
getContacts = () => {
getCs = () => {
// Declare arrays
const contactsWithAccount = []
const contactsWithNoAccount = []
// Get contacts from user's phone
Contacts.getAll((err, contacts) => {
if (err) throw err
// For each contact, iterate
for (var i = 0; i < contacts.length; i++) {
const item = contacts[i]
if (item.phoneNumbers && item.phoneNumbers.length > 0) {
const phone = item.phoneNumbers[0].number
// If the sanitized phone number is different from the current user's phone number (saved in DB), run the following logic
if (this.state.user.phoneNumbers.nationalFormat != sanitizeNumber(phone)
&& this.state.user.phoneNumbers.internationalFormat != sanitizeNumber(phone)
) {
findUserByPhoneNumber(phone, (fu) => {
contactObject = {
key: item.recordID,
name: item.givenName,
normalizedName: item.givenName.toLowerCase(),
phoneNumber: phone,
user: this.state.user,
hasAccount: null,
friendId: null,
isFriend: null
}
const foundUser = fu
// if found user, push in contactsWithAccount, otherwise push in contactsWithNoAccount
if (foundUser && foundUser._id != this.state.user._id) {
contactObject.hasAccount = true
contactObject.friendId = foundUser._id
if (this.state.user.friends && this.state.user.friends.includes(foundUser._id)) {
contactObject.isFriend = true
}
contactsWithAccount.push(contactObject)
}
else {
contactsWithNoAccount.push(contactObject)
}
// if the two arrays are filled up, run the callback
// NOTE_1: we use the two lengths +1 to account for the current
// user's document that we skip and dont add to any of the arrays
// NOTE_2: this bizare method was the only way to handle the results
// coming in asynchronously
if (contactsWithAccount.length + contactsWithNoAccount.length + 1 == contacts.length) {
console.log('finished');
sortCs(contactsWithAccount, contactsWithNoAccount)
}
})
}
}
}
})
}
// sorts the two arrays alphabetically
sortCs = (withAccount, withNoAccount) => {
compare = (a,b) => {
if (a.name < b.name)
return -1;
if (a.name > b.name)
return 1;
return 0;
}
withAccount.sort(compare)
withNoAccount.sort(compare)
this.setState({ withAccount, withNoAccount })
}
// unleash the monster
getCs(sortCs)
}
I am sure the process could be optimized in various ways. Maybe:
different database structure
bundling all queries into one
better use
of async
starting the process at an earlier step in the signup flow
Whatsapp, HouseParty and a bunch of other apps have this feature in place and it loads instantly. I’m not trying to reach that level of perfection yet but there must be some better way…
Any help/suggestions would be greatly appreciated.

Sails.js Async request

I would like to count how many entreprise are in some category but I'm stuck with the asynchrone concept.
Here's what I already have:
Category.getall(function(err, cat){
if(err) return res.negotiate(err);
catIds = []
for( var iCat in cat){
catIds.push(cat[iCat].id)
// and here I would like do something like
Entreprise.count({category_id: cat[iCat].id}, function(err, nbr){
categoriesOUT.push({categorie: cat, entreprise_number: nbr })
// I know that i can not do it but it's just to help to understand the logic I would like to have.
if(cat.length==iCat){
return res.json({categories: categoriesOUT})
}
})
}
})
There are a couple of ways to handle this. One would be to bring in a promise library like Q. Another would be a single database call that can count up enterprise objects grouped by category_id... however, I think that would go beyond Waterline's normal queries, you would have to use .query or .native or something.
The easiest quick fix for you is to just keep a counter of how many results you have handled. You may get tired of this approach after using it a couple of times, but it would look something like this:
Category.getall(function(err, cat){
if(err) { return res.negotiate(err); }
var catIds = [], categoriesOut = [], processedCategories = 0;
for( var iCat in cat){
catIds.push(cat[iCat].id)
Entreprise.count({category_id: cat[iCat].id}, function(err, nbr) {
if (err) {
categoriesOUT.push({categorie: cat, entreprise_number: 0});
} else {
categoriesOUT.push({categorie: cat, entreprise_number: nbr });
}
processedCategories += 1;
if (processedCategories >= cat.length) {
return res.json({categories: categoriesOUT});
}
});
}
});
Here's how I finaly get it only with MySQL request as suggered by #arbuthnott
(The category field is call domaine here)
Domaine.getall(function(err, domaines){
if(err){return res.negotiate(err)}
var domNames = {}, domContain = {}, domOut = [];
Entreprise.query('SELECT domaine_id, COUNT(*) FROM entreprise GROUP BY domaine_id', function(err, entreprises){
if(err){return res.negotiate(err)}
entreprises = JSON.parse(JSON.stringify(entreprises));
for(var ent of entreprises){
domContain[ent['domaine_id']] = ent['COUNT(*)'];
}
for(var iDom in domaines){
var countAdded = false;
for(var dc in domContain){
if(dc==domaines[iDom].id) {
domaines[iDom].entreprises_count = domContain[dc];
countAdded = true;
}
}
if(!countAdded) domaines[iDom].entreprises_count = 0;
}
res.json({domaines:domaines})
})
})

Cannot read property 'unsubscribe' of undefined

i have a ionic project.
I want to unsubscribe when i get the cities from subscription of firebase return.
But something goes wrong and it throws following error when i try to unsubscribe.
How can is solve that?
My code.
getCities() {
let i: any;
let a = this.firebaseProvider.getOtoparks()
.subscribe(data => {
for (i = 0; i < data.length; i++) {
this.cities.push({
name: data[i]['details']['sehir'],
value: i,
}
);
}
// try to unsubscribe
a.unsubscribe();
});
}
The best practice is to use take(1) when you want to unsubscribe without a condition.
getCities() {
let i: any;
let a = this.firebaseProvider.getOtoparks()
.take(1)
.subscribe(data => {
for (i = 0; i < data.length; i++) {
this.cities.push({
name: data[i]['details']['sehir'],
value: i,
});
}
});
}

Meteor call scope variable won't be set

I'm having quite a bit of trouble with the scope of the Meteor.call procedure. It won't set my scope variable to the result.length
'takeaways': function (userId) {
var len = 0;
Meteor.call('userTakeaways', userId, function (error, result) {
if (error) {
console.log('there was an error finding the number of messages that were takeaways')
} else {
len = result.length; // result.length is 2
}
});
console.log(len); // still 0
return len;
}
Please help!
Thank you :)
len is not a reactive variable. So if the len value changes, it won't update the spacebar value.
So here two approach to solving this problem:
1. using reactive var/session.
//Make sure you have install reactive var package
var len = new ReactiveVar(0);
Template['name'].helpers({
'takeaways': function (userId) {
Meteor.call('userTakeaways', userId, function (error, result) {
if (error) {
console.log('there was an error finding the number of messages that were takeaways')
} else {
len.set(result.length); // result.length is 2
}
});
console.log(len.get()); // You will get 2 when response come from you method call.
return len.get();
}
});
2. Using 'simple:reactive-method' package
takeaways : function(userId){
return ReactiveMethod.call('userTakeaways', userId).length;
}
try adding in the else statement
return len = result.length;
as you can see bellow.
'takeaways': function (userId) {
var len = 0;
Meteor.call('userTakeaways', userId, function (error, result) {
if (error) {
console.log('there was an error finding the number of messages that were takeaways')
} else {
return len = result.length; // result.length is 2
}
});
console.log(len); // still 0
return len;
}

Firebase Queue - Handling Reject/Resolve while Looping

I have a Queue that looks like this
new Queue(queueRef, options, ({post, user, postId}, progress, resolve, reject) => {
rootRef.child(`users/${user.user_id}/followers`).once('value', (snapshot) => {
const followers = toArray(snapshot.val())
for (var i = 0; i < followers.length; i++) {
rootRef.child(`users/${followers[i].user_id}/feed/${postId}`).set(post, (err) => {
if (err) {
reject(err)
} else if (i >= followers.length - 1) {
resolve({post, user, postId})
}
})
}
}, reject)
})
My issue is that I'm really only resolving once all the sets have finished and rejecting if any of those fail. What I'd like to do is somehow pass each iteration of a loop to another Queue which can then reject/resolve for that specific request rather than the whole collection.
This looks like it's probably an XY problem and probably has a better solution. But you're looking for something like Q.all().
In essence, call a method that does each op and returns a promise, and resolve/reject when the entire set is done.
new Queue(queueRef, options, ({post, user, postId}, progress, resolve, reject) => {
rootRef.child(`users/${user.user_id}/followers`).once('value', (snapshot) => {
var promiseList = [], p;
const followers = toArray(snapshot.val())
for (var i = 0; i < followers.length; i++) {
p = processNextFollower(followers[i]);
// p.then(progress);
promiseList.push(p);
}
Q.all(promiseList).then(resolve, reject);
}, reject)
})
function processNextFollower(follower, postId) {
var def = Q.defer();
rootRef.child(`users/${follower.user_id}/feed/${postId}`).set(post, (err) => {
if (err) {
def.reject(err)
} else if (i >= followers.length - 1) {
def.resolve({post, user, postId})
}
})
}
return def.promise;
}

Resources