Open multiple links in casperjs - web-scraping

I am trying to scrape all links of special kind (boxscore-links) from this website http://www.basketball-reference.com/teams/GSW/2016_games.html and then visit them one by one, scraping some information from every visited link. For a beginning I want to scrape all links, visit them one by one and get a title of website. The problem is that it always prints the same title and the same current url (initial url) even though it clearly has to be a new one. Seems to me that there is a problem with 'this'-keyword...
(Don't look at limit of links, I took the code from sample on github of casperjs and I left it for console not to be overloaded.)
This is my code:
var casper = require("casper").create({
verbose: true
});
// The base links array
var links = [ "http://www.basketball-reference.com/teams/GSW/2016_games.html" ];
// If we don't set a limit, it could go on forever
var upTo = ~~casper.cli.get(0) || 10;
var currentLink = 0;
// Get the links, and add them to the links array
function addLinks(link) {
this.then(function() {
var found = this.evaluate(searchLinks);
this.echo(found.length + " links found on " + link);
links = links.concat(found);
});
}
// Fetch all <a> elements from the page and return
// the ones which contains a href starting with 'http://'
function searchLinks() {
var links = document.querySelectorAll('#teams_games td:nth-child(5) a');
return Array.prototype.map.call(links, function(e) {
return e.getAttribute('href');
});
}
// Just opens the page and prints the title
function start(link) {
this.start(link, function() {
this.wait(5000, function() {
this.echo('Page title: ' + this.getTitle());
this.echo('Current url: ' + this.getCurrentUrl());
});
});
}
// As long as it has a next link, and is under the maximum limit, will keep running
function check() {
if (links[currentLink] && currentLink < upTo) {
this.echo('--- Link ' + currentLink + ' ---');
start.call(this, links[currentLink]);
addLinks.call(this, links[currentLink]);
currentLink++;
this.run(check);
} else {
this.echo("All done.");
this.exit();
}
}
casper.start().then(function() {
this.echo("Starting");
});
casper.run(check);

Considering an array of URLs, you can iterate over them, visiting each in succession with something like the following:
casper.each(urls, function(self, url) {
self.thenOpen(url, function(){
this.echo('Opening: ' + url);
// Do Whatever
});
});
Obviously this will not find links on a page, but it is a nice way to go over a known set of URLs.

Related

Fullcalendar using resources as a function with select menu

Using Fullcalendar 4, I am trying to show/hide my resources using a select menu. When the user selects one of the providers from a menu, I want to only show that one resourc's events.
Above my fullcalendar I have my select menu:
<select id="toggle_providers_calendar" class="form-control" >
<option value="1" selected>Screech Powers</option>
<option value="2">Slater</option>
</select>
I am gathering the resources I need using an ajax call on my included fullcalendar.php page. I am storing them in an object and then trying to control which resources are shown onscreen:
document.addEventListener('DOMContentLoaded', function() {
var resourceData = [];
$.getJSON('ajax_get_json.php?what=schedule_providers_at_location',
function(data) {
$.each(data, function(index) {
resourceData.push({
id: data[index].value,
title: data[index].text
});
});
console.log(resourceData);
});
//below, set the visible resources to whatever is selected in the menu
//using 1 in order for that to show at start
var visibleResourceIds = ["1"];
//below, get the selected id when the the menu is changed and use that in the toggle resource function
$('#toggle_providers_calendar').change(function() {
toggleResource($('#toggle_providers_calendar').val());
});
var calendar_full = document.getElementById('calendar_full');
var calendar = new FullCalendar.Calendar(calendar_full, {
events: {
url: 'ajax_get_json.php?what=location_appointments'
},
height: 700,
resources: function(fetchInfo, successCallback, failureCallback) {
// below, I am trying to filter resources by whether their id is in visibleResourceIds.
var filteredResources = [];
filteredResources = resourceData.filter(function(x) {
return visibleResourceIds.indexOf(x.id) !== -1;
});
successCallback(filteredResources);
},
...
});
// below, my toggle_providers_calendar will trigger this function. Feed it resourceId.
function toggleResource(resourceId) {
var index = visibleResourceIds.indexOf(resourceId);
if (index !== -1) {
visibleResourceIds.splice(index, 1);
} else {
visibleResourceIds.push(resourceId);
}
calendar.refetchResources();
}
To make sure the getJSON is working, I have console.log(resourceData). The information in the console once it's gathered is:
[{id: '1', title: 'Screech Powers'}, {id: '2', title: 'Slater}]
... the above are the correct resources that can be chosen/rendered. So that seems to be okay.
On page load, no resources show at all, when resource id of '1' (Screech Powers) should be shown per my code. Well, at least, that's what I am trying to do right now.
When the menu changes, resources will show/hide, but not based on what's selected; the logic of only showing what is selected in the menu doesn't seem to be working.
I used to use a URL request for my resources: 'ajax_get_json.php?what=schedule_providers_at_location', and it worked fine! All resources show then their events properly. I am just trying to modify it by using a menu to show/hide the resources as needed.
Here's what I'm doing to make it happen so far! In case someone comes across this post ever, this will help.
Here's my code before my fullcalendar code.
var resourceData = [];
var visibleResourceIds = [];
$.getJSON('ajax_get_json.php?what=schedule_providers_at_location',
function(data) {
$.each(data, function(index) {
resourceData.push({
id: data[index].value,
title: data[index].text
});
});
});
$('#toggle_providers_calendar').change(function() {
toggleResource($('#toggle_providers_calendar').val());
});
My select menu with id 'toggle_providers_calendar' is the same as my original post. My fullcalendar resources as a function is the same too.
After the calendar is rendered, here are the changes I made to my toggle resources function:
// menu button/dropdown will trigger this function. Feed it resourceId.
function toggleResource(resourceId) {
visibleResourceIds = [];
//if select all... see if undefined from loading on initial load = true
if ((resourceId == '') || (resourceId === undefined)) {
$.map( resourceData, function( value, index ) {
visibleResourceIds.push(value.id);
});
}
var index = visibleResourceIds.indexOf(resourceId);
if (index !== -1) {
visibleResourceIds.splice(index, 1);
} else {
visibleResourceIds.push(resourceId);
}
calendar.refetchResources();
}
This causes the resources to show and hide properly. If the user selects "Show All" that works too!
In order to have a default resource show on load, I add this to my fullcalendar script:
loading: function(bool) {
if (bool) {
//insert code if still loading
$('.loader').show();
} else {
$('.loader').hide();
if (initial_load) {
initial_load = false;
//code here once done loading and initial_load = true
var default_resource_to_show = "<?php echo $default_provider; ?>";
if (default_resource_to_show) {
//set the menu to that provider and trigger the change event to toggleresrource()
$('#toggle_providers_calendar').val(default_provider).change();
} else {
//pass in nothing meaning 'select all' providers for scheduler to see
toggleResource();
}
}
}
},
I am using a bool variable of initial_load to see if the page was just loaded (basically not loading data without a page refresh). The bool of initial_load = true is set outside of DOMContentLoaded
<script>
//show selected date in title box
var initial_load = true;
document.addEventListener('DOMContentLoaded', function() {
My only current problem is that when toggleResource function is called, the all day vertical time block boundaries don't line up with the rest of the scheduler. Once I start navigating, they do, but I don't understand why it looks like this on initial load or when toggleResource() is called:
Any thoughts on how to correct the alignment of the allday vertical blocks?

How to make TagsInput to work with both auto complete & free text

UPDATE
This issue is already discussed in github here
I am using tagsinput with typeahead in bootstrap 3. The problem which I am experiencing is with the value in case if user selects the existing tag. Display text shows it right but .val() returns its actual object. Below is the code
$('#tags').tagsinput({
//itemValue: 'value',
typeahead: {
source: function (query) {
//tags = [];
//map = {};
return $.getJSON('VirtualRoomService.asmx/GetTags?pid=' + $("#<%=hdnPID.ClientID%>").val() + '&tok=' + query)
//, function (data) {
// $.each(data, function (i, tag) {
// map[tag.TagValue] = tag;
// tags.push(tag.TagValue);
// });
// return process(tags);
//});
}
}
//freeElementSelector: "#freeTexts"
});
The problem with above code is that it results as below while fetching tags from web method
This happens when user select the existing tag. New tags no issues. I tried setting itemValue & itemText of tagsinput but not worked. Hence I decided a work-around of this problem. Since I could able get the json string as ['IRDAI", Object], if can somehow parse these object & get the actual tag value then I get the expected result of the code I am looking at.
Below is what it appears in tags input as [object Object] for text selected by user from auto populated drop down
[![enter imt
If I i specify TagId & TagValue to itemValue & itemText as below code
$('#tags').tagsinput({
itemValue: 'TagId',
itemText: 'TagValue',
typeahead: {
source: function (query) {
//tags = [];
//map = {};
return $.getJSON('VirtualRoomService.asmx/GetTags?pid=' + $("#<%=hdnPID.ClientID%>").val() + '&tok=' + query)
//, function (data) {
// $.each(data, function (i, tag) {
// //map[tag.TagValue] = tag;
// tags.push(tag.TagValue);
// });
//});
// return process(tags);
}
}
//freeElementSelector: "#freeTexts"
});
Then the result is displaying as below when below code is executed
var arr = junit.Tags.split(',');
for (var i = 0; i < arr.length; i++) {
$('#tags').tagsinput('add', arr[i]);
}
Given your example JSON response from your data source:
[
{"TagId":"1", "TagValue":"eSign"},
{"TagId":"2", "TagValue":"eInsurance Account"}
]
You'll need to tell tagsinput how to map the attributes from your response objects using itemValue and itemText in your tagsinput config object. It looks like you may have started down that path, but didn't reach the conclusion, which should look something like:
$('#tags').tagsinput({
itemValue: 'TagId',
itemText: 'TagValue',
typeahead: {
source: function (query) {
return $.getJSON('VirtualRoomService.asmx/GetTags?pid=' + $("#<%=hdnPID.ClientID%>").val() + '&tok=' + query);
}
}
});
Be sure to checkout the tagsinput examples.
This may not be the clean solution but I got around this issue through below parsing method. Hope this helps someone.
var items = $('#tags').tagsinput("items");
var tags = '';
for(i = 0; i < items.length; i++)
{
if(JSON.stringify(items[i]).indexOf('{') >= 0) {
tags += items[i].TagValue;
tags += ',';
} else {
tags += items[i];
tags += ',';
}
}

Jasmine - Testing links via Webdriver I/O

I have been working on a end-to-end test using Webdriver I/O from Jasmine. One specific scenario has been giving me significant challenges.
I have a page with 5 links on it. The number of links actually challenges as the page is dynamic. I want to test the links to see if each links' title matches the title of the page that it links to. Due to the fact that the links are dynamically generated, I cannot just hard code tests for each link. So, I'm trying the following:
it('should match link titles to page titles', function(done) {
client = webdriverio.remote(settings.capabilities).init()
.url('http://www.example.com')
.elements('a').then(function(links) {
var mappings = [];
// For every link store the link title and corresponding page title
var results = [];
for (var i=0; i<links.value.length; i++) {
mappings.push({ linkTitle: links.value[0].title, pageTitle: '' });
results.push(client.click(links.value[i])
.getTitle().then(function(title, i) {
mappings[i].pageTitle = title;
});
);
}
// Once all promises have resolved, compared each link title to each corresponding page title
Promise.all(results).then(function() {
for (var i=0; i<mappings.length; i++) {
var mapping = mappings[i];
expect(mapping.linkTitle).toBe(mapping.pageTitle);
}
done();
});
});
;
});
I'm unable to even confirm if I'm getting the link title properly. I believe there is something I entirely misunderstand. I am not even getting each links title property. I'm definately not getting the corresponding page title. I think I'm lost in closure world here. Yet, I'm not sure.
UPDATE - NOV 24
I still have not figured this out. However, i believe it has something to do with the fact that Webdriver I/O uses the Q promise library. I came to this conclusion because the following test works:
it('should match link titles to page titles', function(done) {
var promise = new Promise(function(resolve, reject) {
setTimeout(function() { resolve(); }, 1000);
});
promise.then(function() {
var promises = [];
for (var i=0; i<3; i++) {
promises.push(
new Promise(function(resolve, reject) {
setTimeout(function() {
resolve();
}, 500);
})
);
}
Promise.all(promises).then(function() {
expect(true).toBe(true)
done();
});
});
However, the following does NOT work:
it('should match link titles to page titles', function(done) {
client = webdriverio.remote(settings.capabilities).init()
.url('http://www.example.com')
.elements('a').then(function(links) {
var mappings = [];
// For every link store the link title and corresponding page title
var results = [];
for (var i=0; i<links.value.length; i++) {
mappings.push({ linkTitle: links.value[0].title, pageTitle: '' });
results.push(client.click(links.value[i])
.getTitle().then(function(title, i) {
mappings[i].pageTitle = title;
});
);
}
// Once all promises have resolved, compared each link title to each corresponding page title
Q.all(results).then(function() {
for (var i=0; i<mappings.length; i++) {
var mapping = mappings[i];
expect(mapping.linkTitle).toBe(mapping.pageTitle);
}
done();
});
})
;
});
I'm not getting any exceptions. Yet, the code inside of Q.all does not seem to get executed. I'm not sure what to do here.
Reading the WebdriverIO manual, I feel like there are a few things wrong in your approach:
elements('a') returns WebElement JSON objects (https://code.google.com/p/selenium/wiki/JsonWireProtocol#WebElement_JSON_Object) NOT WebElements, so there is no title property thus linkTitle will always be undefined - http://webdriver.io/api/protocol/elements.html
Also, because it's a WebElement JSON object you cannot use it as client.click(..) input, which expects a selector string not an object - http://webdriver.io/api/action/click.html. To click a WebElement JSON Object client.elementIdClick(ID) instead which takes the ELEMENT property value of the WebElement JSON object.
When a client.elementIdClick is executed, the client will navigate to the page, trying to call client.elementIdClick in the next for loop cycle with next ID will fail, cause there is no such element as you moved away from the page. It will sound something like invalid element cache.....
So, I propose another solution for your task:
Find all elements as you did using elements('a')
Read href and title using client.elementIdAttribute(ID) for each of the elements and store in an object
Go through all of the objects, navigate to each of the href-s using client.url('href'), get the title of the page using .getTitle and compare it with the object.title.
The source I experimented with, not run by Jasmine, but should give an idea:
var client = webdriverio
.remote(options)
.init();
client
.url('https://www.google.com')
.elements('a')
.then(function (elements) {
var promises = [];
for (var i = 0; i < elements.value.length; i++) {
var elementId = elements.value[i].ELEMENT;
promises.push(
client
.elementIdAttribute(elementId, 'href')
.then(function (attributeRes) {
return client
.elementIdAttribute(elementId, 'title')
.then(function (titleRes) {
return {href: attributeRes.value, title: titleRes.value};
});
})
);
}
return Q
.all(promises)
.then(function (results) {
console.log(arguments);
var promises = [];
results.forEach(function (result) {
promises.push(
client
.url(result.href)
.getTitle()
.then(function (title) {
console.log('Title of ', result.href, 'is', title, 'but expected', result.title);
})
);
});
return Q.all(promises);
});
})
.then(function () {
client.end();
});
NOTE:
This fails to solve your problem, when the links trigger navigation with JavaScript event handlers not the href attributes.

How to make Chrome Extension run for each new Iframe added?

I created a Chrome Extension as a solution to override the helpText bubbles in SalesForce Console pages. The helpText bubbles show up the text without the ability to link URLs. It looks like this:
The extension is taking the helpText bubble (which in the SalesForce console window, is inside an iFrame) and makes the URL click-able. It also adds word wrap and marks the links in blue.
The solution works fine when the page loads with the initial iFrame (or iFrames) on it, meaning when you open the SalesForce console the first time (https://eu3.salesforce.com/console).
When a new tab is created at the SalesForce console, my inject script doesn't run.
Can you please assist in understanding how to inject the script on each and every new Tab SalesForce Console is creating?
The Extension as follows:
manifest.js:
{
"browser_action": {
"default_icon": "icons/icon16.png"
},
"content_scripts": [ {
"all_frames": true,
"js": [ "js/jquery/jquery.js", "src/inject/inject.js" ],
"matches": [ "https://*.salesforce.com/*", "http://*.salesforce.com/*" ]
} ],
"default_locale": "en",
"description": "This extension Fix SalesForce help bubbles",
"icons": {
"128": "icons/icon128.png",
"16": "icons/icon16.png",
"48": "icons/icon48.png"
},
"manifest_version": 2,
"name": "--Fix SalesForce bubble text--",
"permissions": [ "https://*.salesforce.com/*", "http://*.salesforce.com/*" ],
"update_url": "https://clients2.google.com/service/update2/crx",
"version": "5"
}
And this is the inject.js:
chrome.extension.sendMessage({}, function(response) {
var readyStateCheckInterval = setInterval(function() {
if (document.readyState === "complete") {
clearInterval(readyStateCheckInterval);
var frame = jQuery('#servicedesk iframe.x-border-panel');
frame = frame.contents();
function linkify(inputText) {
var replacedText, replacePattern1, replacePattern2, replacePattern3;
var originalText = inputText;
//URLs starting with http://, https://, file:// or ftp://
replacePattern1 = /(\b(https?|ftp|file):\/\/[-A-Z0-9+&##\/%?=~_|!:,.;]*[-A-Z0-9+&##\/%=~_|])/gim;
replacedText = inputText.replace(replacePattern1, '$1');
//URLs starting with "www." (without // before it, or it'd re-link the ones done above).
replacePattern2 = /(^|[^\/f])(www\.[\S]+(\b|$))/gim;
replacedText = replacedText.replace(replacePattern2, '$1$2');
//Change email addresses to mailto:: links.
replacePattern3 = /(([a-zA-Z0-9\-\_\.])+#[a-zA-Z\_]+?(\.[a-zA-Z]{2,6})+)/gim;
replacedText = replacedText.replace(replacePattern3, '$1');
//If there are hrefs in the original text, let's split
// the text up and only work on the parts that don't have urls yet.
var count = originalText.match(/<a href/g) || [];
if(count.length > 0){
var combinedReplacedText;
//Keep delimiter when splitting
var splitInput = originalText.split(/(<\/a>)/g);
for (i = 0 ; i < splitInput.length ; i++){
if(splitInput[i].match(/<a href/g) == null){
splitInput[i] = splitInput[i].replace(replacePattern1, '$1').replace(replacePattern2, '$1$2').replace(replacePattern3, '$1');
}
}
combinedReplacedText = splitInput.join('');
return combinedReplacedText;
} else {
return replacedText;
}
}
var helpOrbReady = setInterval(function() {
var helpOrb = frame.find('.helpOrb');
if (helpOrb) {
clearInterval(helpOrbReady)
} else {
return;
}
helpOrb.on('mouseout', function(event) {
event.stopPropagation();
event.preventDefault();
setTimeout(function() {
var helpText = frame.find('.helpText')
helpText.css('display', 'block');
helpText.css('opacity', '1');
helpText.css('word-wrap', 'break-word');
var text = helpText.html()
text = text.substr(text.indexOf('http'))
text = text.substr(0, text.indexOf(' '))
var newHtml = helpText.html()
helpText.html(linkify(newHtml))
}, 500); });
}, 1000);
}
}, 1000);
});
It is possible (I have not tested it, but it sounds plausible from a few questions I've seen here) that Chrome does not automatically inject manifest-specified code into newly-created <iframe> elements.
In that case, you will have to use a background script to re-inject your script:
chrome.runtime.onMessage.addListener( function(request, sender, sendResponse) {
if(request.reinject) {
chrome.tabs.executeScript(
sender.tab.id,
{ file: "js/jquery/jquery.js", "all_frames": true },
function(){
chrome.tabs.executeScript(
sender.tab.id,
{ file: "js/inject/inject.js", "all_frames": true }
);
}
);
});
Content script:
// Before everything: include guard, ensure injected only once
if(injected) return;
var injected = true;
function onNewIframe(){
chrome.runtime.sendMessage({reinject: true});
}
Now, I have many questions about your code, which are not directly related to your question.
Why the pointless sendMessage wrapper? No-one is even listening, so your code basically returns with an error set.
Why all the intervals? Use events instead of polling.
If you are waiting on document to become ready, jQuery offers $(document).ready(...)
If you're waiting on DOM modifications, learn to use DOM Mutation Observers, as documented and as outlined here or here. This would be, by the way, the preferred way to call onNewIframe().

Switch to iframe with phantom.js

I would like to switch to an iframe using pure phantom.js code
Here is my first attempt
var page = new WebPage();
var url = 'http://www.theurltofectch'
page.open(url, function (status) {
if ('success' !== status) {
console.log("Error");
} else {
page.switchToFrame("thenameoftheiframe");
console.log(page.content);
phantom.exit();
}
});
It produces only the source code of the main page. Any idea ?
Notice that the iframe domain is different from the main page domain.
Please give this a try I believe it may be an async issues meaning the iframe is not present when trying to access it. I received the below snippet from another post.
var page = require('webpage').create(),
testindex = 0,
loadInProgress = false;
page.onConsoleMessage = function(msg) {
console.log(msg);
};
page.onLoadStarted = function() {
loadInProgress = true;
console.log("load started");
};
page.onLoadFinished = function() {
loadInProgress = false;
console.log("load finished");
};
/*
page.onNavigationRequested = function(url, type, willNavigate, main) {
console.log('Trying to navigate to: ' + url);
console.log('Caused by: ' + type);
console.log('Will actually navigate: ' + willNavigate);
console.log('Sent from the page\'s main frame: ' + main);
};
*/
/*
The steps array represents a finite set of steps in order to perform the unit test
*/
var steps = [
function() {
//Load Login Page
page.open("https://www.yourpage.com");
},
function() {
//access your iframe here
page.evaluate(function() {
});
},
function() {
//any other step you want
page.evaluate(function() {
});
},
function() {
// Output content of page to stdout after form has been submitted
page.evaluate(function() {
//console.log(document.querySelectorAll('html')[0].outerHTML);
});
//render a test image to see if login passed
page.render('test.png');
}
];
interval = setInterval(function() {
if (!loadInProgress && typeof steps[testindex] === "function") {
console.log("step " + (testindex + 1));
steps[testindex]();
testindex++;
}
if (typeof steps[testindex] !== "function") {
console.log("test complete!");
phantom.exit();
}
}, 50);
replace
console.log(page.content);
with
console.log(page.frameContent);
Should return the contents of the frame phantomjs switched to.
If the iframe is from another domain you may need to add the --web-security=no option like this:
phantomjs --web-security=no myscript.js
As an additional information, what xMythicx said could be true. Some iframes are rendered via Javascript after page finishes loading. If the iframe contents are empty, then you will need to wait for all resources to finish loading, before you start grabbing stuff from the page. But this is another issue, if you need an answer on this, I suggest you ask a new question about it, and I will answer there.
Had the same problem for iframes and
phantomjs --web-security=no
helped in my case :]

Resources