How to scrape google news results in puppeteer JS? - web-scraping

I am currently working on scraping Google News pages. I am trying to scrape these pages with puppeteer but when I try to scrape it always returns me an empty result.
Here is my code:
const puppeteer = require('puppeteer')
const cheerio = require('cheerio')
/**
 * Scrape the Google News result list for a search query through a remote browser.
 *
 * The extraction runs inside the page via `page.$$eval`, because an ElementHandle
 * returned by `page.$` lives in Node and has no `textContent` property of its own.
 *
 * @param {string} query - Free-text search term; it is URL-encoded before use.
 * @returns {Promise<Array<{title: string, url: string, snippet: string, imgSrc: string, lastUpdated: string, source: string}>>}
 *   One entry per news card, or an empty array when navigation/extraction fails.
 */
const getNewsData = async (query) => {
  const browser = await puppeteer.connect({
    browserWSEndpoint: `wss://chrome-us.browsercloud.io?token=hided`,
  });
  try {
    const page = await browser.newPage();
    // encodeURIComponent keeps spaces, '&' and '=' in the query from corrupting the URL.
    const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(query)}&tbm=nws&gl=us`;
    await page.goto(searchUrl);

    // NOTE(review): these class names are generated by Google and change over time;
    // verify them against the live markup.
    const results = await page.$$eval(".WlydOe", (cards) =>
      cards.map((card) => {
        const textOf = (selector) => {
          const node = card.querySelector(selector);
          return node ? node.textContent.trim() : "";
        };
        const thumbnail = card.querySelector(".uhHOwf img");
        return {
          title: textOf(".mCBkyc"),
          url: card.href,
          snippet: textOf(".GI74Re"),
          imgSrc: thumbnail ? thumbnail.src : "",
          lastUpdated: textOf(".ZE0LJd span"),
          source: textOf(".CEMjEf span"),
        };
      })
    );
    console.log(results);
    return results;
  } catch (error) {
    console.log("Error : " + error);
    return [];
  } finally {
    // Always release the remote browser, including when navigation or extraction throws.
    await browser.close();
  }
};
getNewsData("football");
Please also help me to scrape news source, thumbnail and date.

You don't need any browser automation to get this information, because it can be fetched with a simple HTTP request, which needs fewer resources. Check how to do this in the online IDE:
const cheerio = require("cheerio");
const axios = require("axios");
const searchString = "football"; // what we want to search

// Percent-encoded form of the search string, retained for anyone building a URL by hand.
// It must NOT be passed through axios `params`: axios encodes parameter values itself,
// so a pre-encoded value would be encoded twice ("%20" would become "%2520").
const encodedString = encodeURI(searchString);

// Request options shared by every Google search request.
const AXIOS_OPTIONS = {
  headers: {
    // Sending a browser User-Agent is one way to keep the request from being blocked.
    "User-Agent":
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36",
  },
  params: {
    q: searchString, // raw search string; axios performs the URL encoding
    tbm: "nws", // type of search ("nws" means news)
    hl: "en", // language to use for the Google search
    gl: "us", // country to use for the Google search
  },
};
/**
 * Fetch the Google News result page and extract one record per news card.
 *
 * Thumbnails are not present in the <img> tags of the raw HTML; they are assigned by
 * inline scripts, so they are recovered with a regular expression over the page source
 * and matched to each card through the image element id.
 *
 * @returns {Promise<Array<{link: string, source: string, title: string, snippet: string, image: string, date: string}>>}
 */
function getNewsInfo() {
  return axios
    .get(`https://google.com/search`, AXIOS_OPTIONS)
    .then(function ({ data }) {
      const $ = cheerio.load(data);
      const pattern = /s='(?<img>[^']+)';\w+\s\w+=\['(?<id>\w+_\d+)'];/gm;

      // Index images by element id once, keeping the first occurrence of each id.
      // "\x3d" is an escaped "=" (base64 padding); restore every occurrence, not just the first.
      const imageById = new Map();
      for (const { groups } of data.matchAll(pattern)) {
        if (!imageById.has(groups.id)) {
          imageById.set(groups.id, groups.img.replace(/\\x3d/g, "="));
        }
      }

      // Strip every newline, not only the first one.
      const cleanText = (text) => text.trim().replace(/\n/g, "");

      return Array.from($(".WlydOe")).map((el) => {
        const card = $(el);
        const imageId = card.find(".uhHOwf img").attr("id");
        return {
          link: card.attr("href"),
          source: card.find(".CEMjEf span").text().trim(),
          title: cleanText(card.find(".mCBkyc").text()),
          snippet: cleanText(card.find(".GI74Re").text()),
          image: imageById.get(imageId) || "No image",
          date: card.find(".ZE0LJd span").text().trim(),
        };
      });
    });
}

// Print the results and surface request failures instead of leaving the promise floating.
getNewsInfo()
  .then((news) => console.log(news))
  .catch((error) => console.error(error));
Output:
[
{
"link":"https://www.cardchronicle.com/2022/7/11/23077819/madden-sanker-commits-to-louisville-football",
"source":"Card Chronicle",
"title":"Madden Sanker Commits to Louisville Football",
"snippet":"Louisville lands their highest rated offensive line recruit in program history.",
"image":"",
"date":"8 hours ago"
},
...and other results
]
You can also check my blog post Web Scraping Google News with Nodejs if you want to know more about this topic.

Check this answer, to get Google News Results:
const unirest = require("unirest");
const cheerio = require("cheerio");
/**
 * Fetch the Google News results for "football" and extract one record per result card.
 *
 * The results are logged (as before) and also returned, so callers can consume them
 * instead of receiving `undefined`.
 *
 * @returns {Promise<Array<{link: string, title: string, snippet: string, date: string, thumbnail: string}>>}
 */
const getNewsData = () => {
  return unirest
    .get("https://www.google.com/search?q=football&gl=us&tbm=nws")
    .headers({
      "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36",
    })
    .then((response) => {
      const $ = cheerio.load(response.body);
      const newsResults = [];
      // NOTE(review): Google's generated class names change over time; verify these selectors.
      $(".BGxR7d").each((i, el) => {
        const card = $(el);
        newsResults.push({
          link: card.find("a").attr("href"),
          title: card.find("div.mCBkyc").text(),
          snippet: card.find(".GI74Re").text(),
          date: card.find(".ZE0LJd span").text(),
          thumbnail: card.find(".NUnG9d img").attr("src"),
        });
      });
      console.log(newsResults);
      return newsResults;
    });
};

// Report request/parsing failures instead of leaving the rejection unhandled.
getNewsData().catch((error) => console.error(error));
If you need an explanation of this code, I have written a blog also on how to scrape Google News Results:
https://serpdog.io/blog/web-scraping-google-news-results-with-node-js.html
Alternative:
You can use Google News API by Serpdog.
Serpdog also offers 100 free credits on the first signup.
Scraping can be time-consuming sometimes, but you can use this pre-cooked structured JSON data which makes your work easier and also you don't have to maintain the Google CSS selectors from time to time which is a big headache.
How to use:
const axios = require('axios');
// Query the Serpdog news endpoint and print either the response payload or the failure.
const SERPDOG_NEWS_URL = 'https://api.serpdog.io/news?api_key=APIKEY&q=football&gl=us';

// Success path: dump the parsed response body.
function printSerpdogPayload(response) {
  console.log(response.data);
}

// Failure path: dump whatever axios (or the success handler) rejected with.
function printSerpdogFailure(error) {
  console.log(error);
}

axios.get(SERPDOG_NEWS_URL).then(printSerpdogPayload).catch(printSerpdogFailure);
Results:
"news_results": [
{
"title": "Martin Bengtsson: football’s Swedish wonderkid whose dream died at Inter",
"snippet": "If Martin Bengtsson feels stressed he kicks a football around on his own and, almost immediately, the tension begins to ebb away.",
"source": "The Guardian",
"imgSrc": "",
"lastUpdated": "3 hours ago",
"rank": "1"
},
.....
Disclaimer: I am the founder of serpdog.io

Related

My http request not working with mailgun api

// Sends a POST request to the Mailgun "messages" endpoint for the lapaihui.org domain.
// Two ds_maps are built: `header` (HTTP headers) and `json` (the message fields).

// HTTP headers for the request.
header = ds_map_create();
header[? "Content-Type"] = "application/json";
// NOTE(review): the API key is sent in a custom "api" header. Mailgun presumably expects
// HTTP Basic auth (user "api", password = key) in an "Authorization" header — confirm
// against the Mailgun API documentation.
header[? "api"] = "My api key that im censoring :)";
// Message fields. The braces below only group the assignments; they do not create a scope.
json = ds_map_create()
{
// NOTE(review): "mailgun#lapaihui.org" is not a valid address; presumably "@" was intended — verify.
json[? "from"] = "mailgun#lapaihui.org";
json[? "to"] = "My email that im also censoring :]";
json[? "subject"] = "Dear user";
json[? "text"] = "This is your game talking";
}
// NOTE(review): the ds_map `json` is passed directly as the request body. http_request
// presumably expects the body as a string, and Mailgun presumably expects form-encoded
// fields rather than JSON — TODO confirm both against the GameMaker and Mailgun docs.
http_request("https://api.mailgun.net/v3/lapaihui.org/messages","POST",header,json);
Basically, I'm trying to send an email using the Mailgun API, but something is just not working.
If any netcode gods can help, I will greatly appreciate it and credit it!
Without any information on the error you are getting or why it's not working, I can't be certain what you need, but I have two working solutions that will hopefully help.
If you're sending from the browser:
/**
 * Queue an email through the Mailgun HTTP API from a browser.
 *
 * The message is sent as multipart form data and authenticated with HTTP Basic auth
 * (user "api", password = the Mailgun API key). The outcome is reported on the console.
 *
 * @param {string} pFrom - Sender address.
 * @param {string} pTo - Recipient address.
 * @param {string} pSubject - Subject line.
 * @param {string} pText - Plain-text body.
 * @param {string} mgApiKey - Mailgun API key.
 */
function sendMgEmail(pFrom, pTo, pSubject, pText, mgApiKey) {
  const formData = new FormData();
  formData.append('from', pFrom);
  formData.append('to', pTo);
  formData.append('subject', pSubject);
  formData.append('text', pText);

  const qXhr = new XMLHttpRequest();
  const qMethod = 'POST';
  const qUrl = 'https://api.mailgun.net/v3/{{YOUR_DOMAIN}}/messages';
  qXhr.open(qMethod, qUrl);
  qXhr.setRequestHeader('Authorization', 'Basic ' + window.btoa('api:' + mgApiKey));

  // Attach the handlers before sending so no event can be missed.
  qXhr.onload = function () {
    // XMLHttpRequest.status is a number, so compare numerically and strictly.
    if (qXhr.status === 200 || qXhr.status === 201) {
      console.log('email queued', qXhr.status, qXhr.responseText);
    } else {
      console.log('ERROR ', qXhr.status, qXhr.responseText);
    }
  };
  // onload never fires for network-level failures (offline, DNS, CORS); report those too.
  qXhr.onerror = function () {
    console.log('ERROR ', qXhr.status, 'network error');
  };

  qXhr.send(formData);
}
If from a Nodejs application, the XMLHttpRequest approach does not seem to work:
First, refer to https://www.npmjs.com/package/mailgun.js?utm_source=recordnotfound.com#messages
Then, install form-data and mailgun.js
npm i form-data
npm i mailgun.js
Lastly, the code...
const FormData = require('form-data');
const Mailgun = require('mailgun.js');
exports.sendMgEmail(pFrom, pTo, pSubject, pText, mgApiKey) {
const mailgun = new Mailgun(FormData);
const mg = mailgun.client({username: 'api', key: mgApiKey})
mg.messages.create('{{YOUR_DOMAIN}}', {
from: pFrom,
to: pTo,
subject: pSubject,
text: pText
})
.then(msg => console.log(msg))
.catch(err => console.error(err));
}

How do I make an M-Pesa Callback URL using Firebase Cloud Firestore?

I'm trying to make an app that can send payments to PayBill numbers with Safaricom's "Lipa Na M-Pesa" (a Kenyan thing). The call is a POST request to URL:
https://sandbox.safaricom.co.ke/mpesa/stkpush/v1/processrequest
with header:
{
'Host': 'sandbox.safaricom.co.ke',
'Authorization': 'Bearer ${await mpesaAccessToken}',
'Content-Type': 'application/json',
}
and body:
{
"BusinessShortCode": "$businessShortCode",
"Password": "${generateLnmPassword(timeStamp)}",
"Timestamp": "$timeStamp",
"TransactionType": "CustomerPayBillOnline",
"Amount": "10",
"PartyA": "$userPhoneNumber",
"PartyB": "$businessShortCode",
"PhoneNumber": "$userPhoneNumber",
"CallBackURL": "?????????????????????????????",
"AccountReference": "account",
"TransactionDesc": "test",
}
I've received an access token, generated a password and made the call successfully, except for that CallBackURL thing... The M-Pesa docs describe their callback like this:
CallBackURL
This is the endpoint where you want the results of the transaction delivered. Same rules for Register URL API callbacks apply.
all API callbacks from transactional requests are POST requests, do not expect GET requests for callbacks. Also, the data is not formatted into application/x-www-form-urlencoded format, it is application/json, so do not expect the data in the usual POST fields/variables of your language, read the results directly from the incoming input stream.
(More info here, but you may need to be logged in: https://developer.safaricom.co.ke/get-started see "Lipa na M-Pesa")
My app is hosted on Firebase Cloud Firestore. Is there any way I can create a callback URL with them that will receive their callback as a document in a Firestore collection?...
Or would this be impossible, given that they would need authorization tokens and stuff to do so... and I can't influence what headers and body M-Pesa will send?
(PS Btw, I code in Flutter/Dart so plz don't answer in Javascript or anything! I'll be clueless... :p Flutter/Dart or just plain text will be fine. Thanks!)
Is there any way I can create a callback URL with them that will
receive their callback as a document in a Firestore collection?...
The most common way to do that in the Firebase ecosystem is to write an HTTPS Cloud Function that will be called by the Safaricom service.
Within the Cloud Function you will be able to update the Firestore document, based on the content of the POST request.
Something like:
exports.safaricom = functions.https.onRequest((req, res) => {
// Get the header and body through the req variable
// See https://firebase.google.com/docs/functions/http-events#read_values_from_the_request
return admin.firestore().collection('...').doc('...').update({ foo: bar })
.then(() => {
res.status(200).send("OK");
})
.catch(error => {
// ...
// See https://www.youtube.com/watch?v=7IkUgCLr5oA&t=1s&list=PLl-K7zZEsYLkPZHe41m4jfAxUi0JjLgSM&index=3
})
});
I did note that you asked us not to "answer in Javascript or anything" but in Flutter/Dart, but I don't think you will be able to implement that in Flutter: you need to implement this webhook in an environment that you fully control and that exposes an API endpoint, like your own server or a Cloud Function.
Cloud Functions may seem complex at first sight, but implementing an HTTPS Cloud Function is not that complicated. I suggest you read the Get Started documentation and watch the three videos about "JavaScript Promises" from the Firebase video series, and if you encounter any problem, ask a new question on SO.
Cloud functions are not Dart-based.
See below solution;
const functions = require("firebase-functions");
const admin = require("firebase-admin");
const parse = require("./parse");
admin.initializeApp();
exports.lmno_callback_url = functions.https.onRequest(async (req, res) => {
const callbackData = req.body.Body.stkCallback;
const parsedData = parse(callbackData);
let lmnoResponse = admin.firestore().collection('lmno_responses').doc('/' + parsedData.checkoutRequestID + '/');
let transaction = admin.firestore().collection('transactions').doc('/' + parsedData.checkoutRequestID + '/');
let wallets = admin.firestore().collection('wallets');
if ((await lmnoResponse.get()).exists) {
await lmnoResponse.update(parsedData);
} else {
await lmnoResponse.set(parsedData);
}
if ((await transaction.get()).exists) {
await transaction.update({
'amount': parsedData.amount,
'confirmed': true
});
} else {
await transaction.set({
'moneyType': 'money',
'type': 'deposit',
'amount': parsedData.amount,
'confirmed': true
});
}
let walletId = await transaction.get().then(value => value.data().toUserId);
let wallet = wallets.doc('/' + walletId + '/');
if ((await wallet.get()).exists) {
let balance = await wallet.get().then(value => value.data().moneyBalance);
await wallet.update({
'moneyBalance': parsedData.amount + balance
})
} else {
await wallet.set({
'moneyBalance': parsedData.amount
})
}
res.send("Completed");
});
Parse function.
const moment = require("moment");
/**
 * Flatten an M-Pesa `stkCallback` payload into a plain object.
 *
 * The request ids and result fields are always copied. The `CallbackMetadata` items
 * (Amount, MpesaReceiptNumber, TransactionDate, PhoneNumber) are read only when
 * `ResultCode` is 0; any other item name is ignored.
 *
 * @param {object} responseData - The `Body.stkCallback` object of the callback request.
 * @returns {object} Object with merchantRequestID, checkoutRequestID, resultDesc and
 *   resultCode, plus amount, mpesaReceiptNumber, transactionDate (Unix seconds) and
 *   phoneNumber when present.
 */
function parse(responseData) {
  const parsedData = {
    merchantRequestID: responseData.MerchantRequestID,
    checkoutRequestID: responseData.CheckoutRequestID,
    resultDesc: responseData.ResultDesc,
    resultCode: responseData.ResultCode,
  };

  if (parsedData.resultCode !== 0) {
    return parsedData;
  }

  // Tolerate a success payload without metadata instead of throwing on `.Item`.
  const items =
    (responseData.CallbackMetadata && responseData.CallbackMetadata.Item) || [];

  for (const element of items) {
    switch (element.Name) {
      case "Amount":
        parsedData.amount = element.Value;
        break;
      case "MpesaReceiptNumber":
        parsedData.mpesaReceiptNumber = element.Value;
        break;
      case "TransactionDate":
        // The timestamp uses a 24-hour clock, so the hour token must be "HH";
        // "hh" (12-hour) makes every afternoon or midnight timestamp invalid.
        parsedData.transactionDate = moment(
          element.Value,
          "YYYYMMDDHHmmss"
        ).unix();
        break;
      case "PhoneNumber":
        parsedData.phoneNumber = element.Value;
        break;
    }
  }
  return parsedData;
}
module.exports = parse;

how to pass format to google cloud translation API using the client library?

We are using google cloud translation API in our express application.
I am trying to do translations using the client library instead of making an API request every time.
1. What I want to know is how to pass the options like format (text or html) to the api while using the client library?
I can achieve this via making http requests using requestjs like this:
var request = require('request');
// Translate text through the Cloud Translation v2 REST endpoint using `request`.
var url = 'https://translation.googleapis.com/language/translate/v2';
// Query-string parameters. `q` may be a single string or an array of strings.
var options1 = {
  q: 'amore mio',
  target: 'hi',
  format: 'text',
  source: 'it',
  key: 'my API key'
}
// useQuerystring makes `request` serialize arrays as repeated keys (q=a&q=b) instead of
// the default indexed form (q[0]=a&q[1]=b), which the API answers with "Required Text".
request.post({url: url, qs: options1, useQuerystring: true}, (err, res, body) => {
  if (err) {
    console.log('ERR: ', err);
    // On a transport error `res` is undefined; stop before dereferencing it.
    return;
  }
  console.log('RES: ', res.statusCode);
  console.log('Body: ', body);
})
But the sample for using client library shows only this:
// Scoped npm packages start with "@"; "#google-cloud/translate" cannot be resolved.
const {Translate} = require('@google-cloud/translate');
// Your Google Cloud Platform project ID
const projectId = 'YOUR_PROJECT_ID';
// Instantiates a client
const translate = new Translate({
  projectId: projectId,
});
// The text to translate
const text = 'Hello, world!';
// The target language
const target = 'ru';
// Translates some text into Russian and prints the source text and the translation.
translate
  .translate(text, target)
  .then(results => {
    // The first element of the result is the translation itself.
    const translation = results[0];
    console.log(`Text: ${text}`);
    console.log(`Translation: ${translation}`);
  })
  .catch(err => {
    console.error('ERROR:', err);
  });
Is there a way I can pass options like 'format' using the client library?
How can I pass an array of strings to the q attribute (querystring) of the options object in the first method? If I pass an array directly like:
q: ['amore mio', 'grazie']
I get an error message :
RES: 400
Body: {
"error": {
"code": 400,
"message": "Required Text",
"errors": [
{
"message": "Required Text",
"domain": "global",
"reason": "required"
}
]
}
}
With respect to question 2 about passing the array of input arguments, this works fine if you use cURL to send the POST request similar to this example. I have tried it myself with success. I have tried different manipulations of your code from snippet 1 with the request library, but it seems as if the request library is not passing the array correctly. I would generally suggest using the client library, which can successfully handle arrays in the input text.
Okay after a little research I just tried to pass options object with format and other properties (like source and target language) instead of target, and it worked.
So this can be achieved by:
// Translation request options: target language plus the input format.
const options = { to: target, format: 'html', prettyPrint: true };

// Prints whether the translation came back as an array, then the source text and the translation.
function reportTranslation(results) {
  const translation = results[0];
  console.log('flag: ', Array.isArray(translation));
  console.log(`Text: ${text}`);
  console.log(`Translation: ${translation}`);
}

// Prints a failed translation request.
function reportTranslationError(err) {
  console.error('ERROR:', err);
}

translate.translate(text, options).then(reportTranslation).catch(reportTranslationError);
Use JSON.stringify
// The closing parenthesis of JSON.stringify(...) was missing, and the JSON text must be
// percent-encoded to be a valid query-string value.
// NOTE(review): whether the API accepts a JSON-encoded array in `q` is not shown here — verify.
`https://translation.googleapis.com/language/translate/v2?q=${encodeURIComponent(JSON.stringify([array]))}`

Issue with sending LiveChat messages via DDP in RocketChat

I am trying to use the DDP Realtime API to initiate a LiveChat conversation but I am facing issues.
https://rocket.chat/docs/developer-guides/realtime-api/livechat-api
I am doing all the steps as per the documentation. In the first API outcome, you can see that it says numAgents: 2 and online: true. However, when I try to send a message to the same department, it says: "Sorry, no online agents".
Is there a way to find out the problem?
Result of livechat:getInitialData
{ enabled: true,
title: 'xyz.com',
color: '#C1272D',
registrationForm: false,
room: null,
triggers: [],
departments:
[ { _id: 'CxCTgXL4csw3TcW6S',
enabled: true,
name: 'Support',
description: '',
numAgents: 2,
showOnRegistration: true,
_updatedAt: 2017-09-24T06:46:39.657Z } ],
allowSwitchingDepartments: true,
online: true,
offlineColor: '#666666',
offlineMessage: 'We are not online right now. Please leave us a message:',
offlineSuccessMessage: '',
offlineUnavailableMessage: '',
displayOfflineForm: true,
videoCall: true,
offlineTitle: 'Leave a message',
language: '',
transcript: false,
transcriptMessage: 'Would you like a copy of this chat emailed?' }
Result of livechat:registerGuest
{ userId: 'j65Cp5peeLJLYhWQi',
token: 'J8IpnpB1yN1AYtO0e0EzLhuaRhe0zaZkjHBAamsehSO' }
Result of Login
{ id: 'j65Cp5peeLJLYhWQi',
token: 'J8IpnpB1yN1AYtO0e0EzLhuaRhe0zaZkjHBAamsehSO',
tokenExpires: 2017-12-23T07:45:01.928Z }
Result of sendMessageLivechat
{ isClientSafe: true,
error: 'no-agent-online',
reason: 'Sorry, no online agents',
message: 'Sorry, no online agents [no-agent-online]',
errorType: 'Meteor.Error' }
These are the parameters I am sending to sendMessageLiveChat.
"_id" : "j65Cp5peeLJLYhWQi"
"rid" : "a_random_string"
"msg": "Hello"
"token" : "J8IpnpB1yN1AYtO0e0EzLhuaRhe0zaZkjHBAamsehSO"
Could someone help me?
This is how I called registerGuest.
// Register a livechat guest for the selected department over DDP.
// NOTE(review): `authToken` is described as the admin's auth token; the guest token is
// presumably meant to be a fresh, self-generated visitor token — confirm.
ddpClient.call(
  "livechat:registerGuest",
  [
    {
      "token": authToken,
      "name": "test1",
      "email": "test2@gmail.com",
      "department": department._id
    },
    25
  ],
  function (err, info) {
    // Surface registration failures instead of ignoring them silently.
    if (err) {
      console.error("livechat:registerGuest failed", err);
    }
  }
);
the token passed by me here is the admin's authToken
The ddpClient object is obtained using the DDP npm package.
I solved this by a combination of
setting the bot as livechat agent & manager at the same time (I've read that tip somewhere it might be nonsense)
in Admin -> Omnichannel -> routing I've set 'accept even when no agents are online' (since my bot was never online, but it was replying when DMessaged) + 'assign bot agents to new conversations'
I've setup myself a livechat-manager + livechat-agent role, but stayed in a different department, that way I can takeover
The rocket chat live api docs are quite out of date, just got stream-room-messages working because of a random forum post. Generally, registerGuest works with very minimal parameters as well, namely a random, self generated token + a name.
Here's my code for the complete setup
async subscribeToLiveRoom(message){
var _self = this
// let initial = await this.api
// .call("livechat:getInitialData",[token])
// register
const token = this.randomString()
var guestUser = await this.api
.call(
'livechat:registerGuest',
[{
token: token,
name: _self.$auth.user.name
}]
)
.catch(console.error)
console.log('guest', guestUser.visitor.token)
this.setActiveGuest(guestUser)
var roomId = this.randomString()
this.setActiveRoom(roomId)
let msg = await this.api
.call(
'sendMessageLivechat',
[{
_id: _self.randomString(),
rid: roomId,
msg: message,
token: guestUser.visitor.token
}])
.catch(console.error)
try {
let liveStream = await this.$subscribe("stream-livechat-room",[
roomId,
{
"useCollection": true,
"args":[
{
"visitorToken": guestUser.visitor.token
}
]
}
])
this.msgLive = await this.find('stream-livechat-room')
} catch (e) {
console.log(e)
}
//
try {
var roomStream = await this.$subscribe("stream-room-messages",[
roomId,
{
"useCollection": true,
"args":[
{
"visitorToken": guestUser.visitor.token
}
]
}
])
console.log('roomstream')
var update = this.find('stream-room-messages')
} catch (e) {
console.log('an error occured', e)
}
console.log( this.msg)
},
async sendToLiveRoom(message, rId){
var _self = this
// let initial = await this.api
// .call("livechat:getInitialData",[token])
// register
let msg = await this.api
.call(
'sendMessageLivechat',
[{
_id: _self.randomString(),
rid: rId,
msg: message,
token: _self.guest.visitor.token
}])
.catch(console.error)
},
By the way, since it's not well documented, you will get room-messages in livechat rooms via subscribing to stream-room-messages while you get room status changes (like switched to another agent) by subscribing to stream-livechat-room

Casperjs web service multithread

I am using a CasperJS script as a web service, accessing it from a node server. What I did not manage to do is to make Casper be 'multithread'. If I make two simultaneously requests to Casper from postman the result will be something scrambled between both requests for one response, and the second will be null. I saw that PhantomJS has a page principle, but I did not find anything similar for Casper.
Can i call Casper's web service with multiple requests at the same time and get correct/coherent responses?
Is there some configuration needed for the web server to allow me to do this?
Should the request be done in a 'special manner'? Are there any caveats regarding this that i should be aware of?
If it can only function sequentially, would starting multiple servers on the same machine but different ports solve the issue?
Here is the casper web service i am talking about. When I make a request like
localhost:1338/?query=name.name it will crawl for that query on the
specified url. My problem comes when I make 2 parallel requests with different queries.
// PhantomJS web server that runs one CasperJS crawl per incoming HTTP request and
// answers with the scraped values as JSON: {p1: [...], p2: [...], p3: [...]}.
"use strict";
var port = 1338;
var server = require('webserver').create();
var url = 'url to scrap';

// Start the web server; the callback runs once per request.
var service = server.listen(port, function(request, response) {
    // Per-request result buffers, filled by the crawl steps below.
    var arr1 = [];
    var arr2 = [];
    var arr3 = [];

    // A separate casper instance is created for every request.
    var casper = require("casper").create({
        verbose: true,
        logLevel: 'error',
        pageSettings: {
            loadImages: false,
            loadPlugins: false,
            userAgent: 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36'
        },
        clientScripts: ["vendor/jquery-1.12.1.js"]
    });

    // NOTE(review): the third argument is written like a failure callback, but whether
    // casper.start invokes it is not visible here — verify.
    casper.start(url, function() {
    }, function(){
        console.log(url + " not found");
        return;
    });

    // Wait for the search input; the second callback runs on timeout.
    casper.waitForSelector('.cssClass', function() {
    }, function(){
        console.log("not found");
        return;
    });

    casper.then(function() {
        // NOTE(review): getQuery is not defined in this snippet; presumably it extracts
        // the `query` parameter from the request URL — verify.
        var query = getQuery(request.url);
        casper.sendKeys('.cssClass', query);
        casper.click('.cssClass');

        // First result set: text content of every matching node.
        casper.waitForSelector('.cssClass', function(){
            arr1 = this.evaluate(function(){
                var nodeList = document.querySelectorAll(".cssClass");
                return Array.prototype.map.call(nodeList, function(node){
                    return node.textContent;
                });
            });
        }, function(){
            console.log("not found");
            return;
        });

        casper.then(function(){
            // `names` was never declared, which is a ReferenceError in strict mode.
            // The first result set lives in arr1, so that is what gets checked here.
            if(arr1.length > 0)
            {
                // Second result set: text content.
                casper.waitForSelector('.cssClass', function(){
                    arr2 = this.evaluate(function(){
                        var nodeList = document.querySelectorAll(".cssClass");
                        return Array.prototype.map.call(nodeList, function(node){
                            return node.textContent;
                        });
                    });
                    console.log("found");
                }, function(){
                    console.log("not found");
                    return;
                });

                // Third result set: the `src` attribute of every matching node.
                casper.waitForSelector('.cssClass', function(){
                    arr3 = this.evaluate(function(){
                        var nodeList = document.querySelectorAll(".cssClass");
                        return Array.prototype.map.call(nodeList, function(node){
                            return node.src;
                        });
                    });
                    console.log("found");
                }, function(){
                    console.log("not found");
                    return;
                });
            }
        });
    });

    // Run the queued steps, then answer the HTTP request with the collected data.
    casper.run(function() {
        response.statusCode = 200;
        response.write(JSON.stringify({p1: arr1, p2: arr2, p3: arr3}));
        response.close();
    });
});
console.log('\nServer running at http://localhost:' + port+'/');

Resources