How to store data from multi page to json? - web-scraping

thank you for ur attention, so i write a mini project that scrape news site and store main texts from them. i tried many solutions to add json in my project without have consol.log but always after scraping its show only one main text. so i show my code to you so u could help me how to have json with all three news.
const { Cluster } = require('puppeteer-cluster');
const fs = require('fs');
const launchOptions = {
headless: false,
args: [
'--disable-gpu',
'--disable-dev-shm-usage',
'--disable-web-security',
'--disable-xss-auditor',
'--disable-accelerated-2d-canvas',
'--ignore-certifcate-errors',
'--ignore-certifcate-errors-spki-list',
'--no-zygote',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-webgl',
],
ignoreHTTPSErrors: true,
waitUntil: 'networkidle2',
};
(async() => {
// Create a cluster with 2 workers
const cluster = await Cluster.launch({
monitor: true,
concurrency: Cluster.CONCURRENCY_PAGE,
maxConcurrency: 2,
puppeteerOptions: launchOptions,
});
// Define a task (in this case: screenshot of page)
await cluster.task(async({ page, data: url }) => {
await page.setRequestInterception(true);
page.on('request', (request) => {
if (['stylesheet', 'font', 'image', 'styles','other', 'media'].indexOf(request.resourceType()) !== -1) {
request.abort();
} else {
request.continue();
}
});
await page.goto(url);
const scrapedData = await page.$eval('div[class="entry-content clearfix"]', el => el.innerText)
fs.writeFileSync('test.json', JSON.stringify(scrapedData, null, 2))
});
// Add some pages to queue
cluster.queue('https://www.ettelaat.com/?p=526642');
cluster.queue('https://www.ettelaat.com/?p=526640');
cluster.queue('https://www.ettelaat.com/?p=526641');
// Shutdown after everything is done
await cluster.idle();
await cluster.close();
})();

for gather all outputs i had to put my fs in the bottom of cluster.close
kanopyDB = []
.
.
.
kanopyDB = kanopyDB.concat(name);
.
.
.
await cluster.idle();
await cluster.close();
fs.writeFileSync('output.json', kanopyDB, 'utf8');

Related

How to get the next sibling and ignore tokens in ts-morph?

I try to get the next sibling using getNextSibling from ts-morph.
The problem is I get the node DotToken or OpenBracketToken which I don't care about them. I want the node with the content. the problem this is can be any node (Identifier or StringLiteral or BinaryExpression or anything else).
Is there a safe way to get the node and ignore the tokens?
import { Identifier, Project, SyntaxKind } from "ts-morph";
console.clear();
const project = new Project();
const sourceFile = project.createSourceFile(
"foo.ts",
`
foo.bar.some();
foo[bar].some();
foo['bar'].some();
foo['bar' + 'other'].some();
`
);
const foos = sourceFile
.getDescendantsOfKind(SyntaxKind.Identifier)
.filter((i) => i.getText() === "foo");
foos.forEach((foo) => {
const s = foo.getNextSibling();
console.log({ s: s.getText() }); // outputs: . [ [ [
});
codesandbox.io
ts-ast-viewer.com

Generate paginated URLs like ?page=2, ?page=3 etc. in NextJS getStaticPaths and access the query params in getStaticProps

I am building a NextJS based site with multiple locales (different domains) where the data comes from storyblok CMS (folder level translation).
I am trying to figure out the best approach to statically generate the paginated URLs for the blog and since the data is known at build time, I figured the best approach would be to generate all URLs in getStaticPaths and then fetch the data for each Page in getStaticProps. This works fine for routes without parameters but when returning a page parameter along with the slug parameter in getStaticPaths, I cannot access it in getStaticProps.
I know that query params cannot be accessed in getStaticPaths because we cannot know the custom querys at buildtime, but in this specific case, we actually can since these paths are generated in getStaticProps.
pages/[[...slug]].jsx
import {
useStoryblokState,
getStoryblokApi,
StoryblokComponent,
} from "#storyblok/react";
export default function Page({
story,
locale,
locales,
defaultLocale,
stories,
}) {
story = useStoryblokState(story, {
// language: locale,
});
return (
<div>
<StoryblokComponent
blok={story.content}
storyData={story}
stories={stories}
/>
</div>
);
}
export async function getStaticProps({
locale,
locales,
defaultLocale,
params,
}) {
console.log(params.slug); // This logs the slug
console.log(params.page); // This logs undefined
console.log(params.query.page); // This logs undefined
// Empty slug on front page
// Make sure root element page pr folder are selected in storyblok
let slug = params.slug ? params.slug.join("/") : "";
let sbParams = {
version: "draft",
resolve_relations: relationsResolvers,
language: locale,
};
let { data } = await getStoryblokApi().get(
`cdn/stories/${locale}/${slug}`,
sbParams
);
let sbIndexParams = {
version: "draft",
resolve_relations: relationsResolvers,
per_page: 10,
page: params.page || 1,
starts_with: `${locale}/${slug}`,
sort_by: "first_published_at:desc",
language: locale,
filter_query: {
component: {
in: "page,post,case,template",
},
},
};
/* fetch an array of stories if page is startpage */
let storiesData = null;
if (data.story.is_startpage) {
storiesData = await getStoryblokApi().get(`cdn/stories`, sbIndexParams);
}
return {
props: {
story: data ? data.story : false,
key: data ? data.story.id : false,
stories:
data.story.is_startpage && storiesData
? storiesData.data.stories
.filter((story) => story.is_startpage == false)
.map((story) => {
return {
name: story.name,
created_at: story.created_at,
published_at: story.published_at,
id: story.id,
uuid: story.uuid,
slug: story.slug,
full_slug: story.full_slug,
is_startpage: story.is_startpage,
content: {
cover: story.content.cover ?? null,
cover_image: story.content.cover_image ?? null,
author: story.content.author ?? null,
category: story.content.category ?? null,
},
};
})
: false,
locale,
locales,
defaultLocale,
},
revalidate: 3600,
};
}
export async function getStaticPaths({ locales }) {
let { data } = await getStoryblokApi().get("cdn/links/", {
is_folder: false,
filter_query: {
component: {
in: "page,post,case,template",
},
},
});
let paths = [];
Object.keys(data.links).forEach((linkKey) => {
if (data.links[linkKey].is_folder) {
return;
}
// get array for slug because of catch all
const slug = data.links[linkKey].slug;
let splittedSlug = slug.split("/");
const linkLocale = splittedSlug[0];
splittedSlug.shift();
if (splittedSlug == "") splittedSlug = false;
// create additional languages
for (const locale of locales) {
if (linkLocale === locale) {
paths.push({ params: { slug: splittedSlug }, locale });
}
}
});
// pagination route generation on custom post types like posts and cases
const per_page = 10;
const startPagesArr = Object.values(data.links)
.map((obj) => obj)
.filter((obj) => obj.is_startpage == true)
.filter((obj) => obj.slug.split("/").length > 2);
// make a loop that loops through all startpages and fetches all stories that are children of that startpage
for (const startPage of startPagesArr) {
let res = await getStoryblokApi().get("cdn/links/", {
is_folder: false,
starts_with: startPage.slug,
paginated: 1,
page: 1,
per_page: per_page,
sort_by: "first_published_at:desc",
filter_query: {
component: {
in: "post,case,template",
},
},
});
let totalPages = Math.ceil(res.total / per_page);
let splittedSlug = startPage.slug.split("/");
const linkLocale = splittedSlug[0];
splittedSlug.shift();
if (splittedSlug == "") splittedSlug = false;
// ... Loop through locales and push the paginated pages to the paths Array
for (const locale of locales) {
if (linkLocale === locale) {
for (let i = 2; i <= totalPages; i++) {
paths.push({
params: {
slug: splittedSlug, // this is passed to the getStaticProps function
page: i, //this is not passed to the getStaticProps function
},
locale,
});
}
}
}
}
return {
paths: paths,
fallback: false,
};
}
Accessing the page query param in getStaticProps would solve the problem since I can pass that value to the API request and get the right blogposts to display on the right paginated pages.
Fetching data directly in the component is not preferable for SEO reasons since it will be client-side JS.
All the logic is for the whole site is in the pages/[[...slug.jsx]] file since there are multiple locales, but would it make sense to split it up so I have a dynamic file for the blog itself (across locales)?
I have tried returning the page query param in several different ways, but getStaticProps will only see the param that matches the filename (ex. params.slug will be accessible because the file is called [[...slug]].jsx].

Webkit2 with gjs - get response headers

I'm experimenting with gjs and webkit2, how can i get the http headers of a request made with load_uri
i have the following code
const Gtk = imports.gi.Gtk, WebKit=imports.gi.WebKit2, contentManager=new WebKit.UserContentManager,
view = WebKit.WebView.new_with_user_content_manager(contentManager);
Gtk.init(null);
let win = new Gtk.Window(), Response=new WebKit.URIResponse();
contentManager.add_script (new WebKit.UserScript("alert ('test');",0,1,null,null));
view.load_uri('https://www.gnome.org');
win.add(view);
win.set_title("test");
win.set_icon_from_file("/games/aptdaemon-resolve.png");
win.connect('destroy', () => { Gtk.main_quit(); });
win.set_size_request(640, 480);
win.show_all();
view.connect("load-changed",function (instance,state)
{
if (state == 3)
{
log ("URL"+Response.get_uri());
view.run_javascript ("alert (document.body.innerHTML)",null,null);
}
});
Gtk.main();
for example Response.get_uri returns an empty string, how to access response headers, and how to exchange messages between scripts injected with view.run_javascript and gjs. i want the body html be sent to gjs-?
got it
const Gtk = imports.gi.Gtk;
const WebKit=imports.gi.WebKit2;
Gtk.init(null);
const win = new Gtk.Window(), contentManager=new WebKit.UserContentManager, view = WebKit.WebView.new_with_user_content_manager(contentManager);
let response_STR;
contentManager.connect("script-message-received::pipe", function (instance, message)
{
message=message.get_js_value().to_string ();
log (message);
});
contentManager.register_script_message_handler("pipe");
view.load_uri('https://www.gnome.org');
win.add(view);
win.set_title("test");
win.connect('destroy', () => { Gtk.main_quit(); });
win.set_size_request(640, 480);
win.show_all();
view.connect("load-changed",function (instance,status)
{
let headers, response_STR="";
if (status == 3)
{
/* WebKitView.get_main_resource -> returns WebResource
WebResource.get_response -> returns URIResponse
URIResponse.get_http_headers -> returns Soup.MessageHeaders */
headers=view.get_main_resource().get_response().get_http_headers();
response_STR="";
headers.foreach ((name, value) => { response_STR+=name+": "+value+"\n"});
view.run_javascript('window.webkit.messageHandlers.pipe.postMessage(document.body.innerHTML);', null, null);
log (response_STR);
}
});
Gtk.main();

With Strapi 4 how can I get each users music events

I'm using strapi 4 with nextjs.
In the app strapi holds music events for each user and each user should be able add and retrieve there own music events.
I am having trouble retrieving
each users music events from strapi 4
I have a custom route and custom controller
The custom route is in a file called custom-event.js and works ok it is as follows:
module.exports = {
routes: [
{
method: 'GET',
path: '/events/me',
handler: 'custom-controller.me',
config: {
me: {
auth: true,
policies: [],
middlewares: [],
}
}
},
],
}
The controller id a file called custom-controller.js and is as follows:
module.exports = createCoreController(modelUid, ({strapi }) => ({
async me(ctx) {
try {
const user = ctx.state.user;
if (!user) {
return ctx.badRequest(null, [
{messages: [{ id: 'No authorization header was found'}]}
])
}
// The line below works ok
console.log('user', user);
// The problem seems to be the line below
const data = await strapi.services.events.find({ user: user.id})
// This line does not show at all
console.log('data', data);
if (!data) {
return ctx.notFound()
}
return sanitizeEntity(data, { model: strapi.models.events })
} catch(err) {
ctx.body = err
}
}
}))
Note there are two console.logs the first console.log works it outputs the user info
The second console.log outputs the data it does not show at all. The result I get back
using insomnia is a 200 status and an empty object {}
The following line in the custom-controller.js seems to be where the problem lies it works for strapi 3 but does not seem to work for strapi 4
const data = await strapi.services.events.find({ user: user.id})
After struggling for long time, days infact, I eventually got it working. Below is the code I came up with. I found I needed two queries to the database, because I could not get the events to populate the images with one query. So I got the event ids and then used the event ids in a events query to get the events and images.
Heres the code below:
const utils = require('#strapi/utils')
const { sanitize } = utils
const { createCoreController } = require("#strapi/strapi").factories;
const modelUid = "api::event.event"
module.exports = createCoreController(modelUid, ({strapi }) => ({
async me(ctx) {
try {
const user = ctx.state.user;
if (!user) {
return ctx.badRequest(null, [
{messages: [{ id: 'No authorization header was found'}]}
])
}
// Get event ids
const events = await strapi
.db
.query('plugin::users-permissions.user')
.findMany({
where: {
id: user.id
},
populate: {
events: { select: 'id'}
}
})
if (!events) {
return ctx.notFound()
}
// Get the events into a format for the query
const newEvents = events[0].events.map(evt => ({ id: { $eq: evt.id}}))
// use the newly formatted newEvents in a query to get the users
// events and images
const eventsAndMedia = await strapi.db.query(modelUid).findMany({
where: {
$or: newEvents
},
populate: {image: true}
})
return sanitize.contentAPI.output(eventsAndMedia,
strapi.getModel(modelUid))
} catch(err) {
return ctx.internalServerError(err.message)
}
}
}))

firebase snapshot change and angular2

i'm facing a little issue with firebase and angular2. I'm loading some little task and users can apply to them.
loadTask() {
let post = []
const publicationRef = this.rootRef.child('task').child('attente');
const usersR = this.rootRef.child('users')
return Observable.create((observer) => {
this.geoQuery.on("key_entered", function(key, location, distance){
publicationRef.child(key).on('value', snapshot => {
const taskkey = snapshot.key()
snapshot.forEach((data) => {
const datakey = data.key();
const dataVal = data.val();
usersR.orderByKey().equalTo(dataVal.user).on('child_added', snapshot => {
console.log({key: datakey,nom: snapshot.val().nom, distances: distance})
post.push({nom: dataVal.nom, key: datakey,unom: snapshot.val().nom, uprenom: snapshot.val().prenom, distance: Math.floor(distance), uimage:snapshot.val().image, budget: dataVal.budget, description: dataVal.description, date:dataVal.date, location: dataVal.location, user: dataVal.user, id1: taskkey, id:datakey})
observer.next(post)
})
})
})
})
})
}
all the task are displayed but when i apply to a task , in the dom it call a new push method , so a new task with the same value appear and disseapear when i reload the page. i tried to remove the push method
loadTask() {
const publicationRef = this.rootRef.child('task').child('attente');
const usersR = this.rootRef.child('users')
return Observable.create((observer) => {
this.geoQuery.on("key_entered", function(key, location, distance){
publicationRef.child(key).on('value', snapshot => {
const taskkey = snapshot.key()
snapshot.forEach((data) => {
const datakey = data.key();
const dataVal = data.val();
usersR.orderByKey().equalTo(dataVal.user).on('child_added', snapshot => {
console.log({key: datakey,nom: snapshot.val().nom, distances: distance})
const publicio = [{nom: dataVal.nom, key: datakey,unom: snapshot.val().nom, uprenom: snapshot.val().prenom, distance: Math.floor(distance), uimage:snapshot.val().image, budget: dataVal.budget, description: dataVal.description, date:dataVal.date, location: dataVal.location, user: dataVal.user, id1: taskkey, id:datakey}]
observer.next(publicio)
})
})
})
})
})
}
apply to a task doesnt add a new data with the same value , but in the dom it only load one set of data and not all of them
Try to clear post in the "on" method:
publicationRef.child(key).on('value', snapshot => {
post = [];
const taskkey = snapshot.key();
.
.
.

Resources