module error in mechanize python - mechanize

I am using mechanize python to login a website combochat2.us with username mask3 and pwd findnext, but it's showing an error like "no module found mechanize"
import cookielib
import urllib2
import mechanize
# Browser
br = mechanize.Browser()
# Enable cookie support for urllib2
cookiejar = cookielib.LWPCookieJar()
br.set_cookiejar( cookiejar )
# Broser options
br.set_handle_equiv( True )
br.set_handle_gzip( True )
br.set_handle_redirect( True )
br.set_handle_referer( True )
br.set_handle_robots( False )
# ??
br.set_handle_refresh( mechanize._http.HTTPRefreshProcessor(), max_time = 1 )
br.addheaders = [ ( 'User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1' ) ]
# authenticate
br.open(http://combochat2.us)
br.select_form( name="combochat" )
# these two come from the code you posted
# where you would normally put in your username and password
br[ "mask3" ] = yourLogin
br[ "findnext" ] = yourPassword
res = br.submit()
print "Success!\n"
I already installed the mechanize module.

mechanize supports only python 2.
Refer https://stackoverflow.com/a/31774959/4773973 for alternatives

I do know how you installed Mechanize but try:
pip install mechanize
I had an issue fixed installing Mechanize this way.

Related

How to scrape url links when the website takes us to a splash screen?

import requests
from bs4 import BeautifulSoup
import re
R = []
url = "https://ascscotties.com/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; ' \
'Intel Mac OS X 10.6; rv:16.0) Gecko/20100101 Firefox/16.0'}
reqs = requests.get(url, headers=headers)
soup = BeautifulSoup(reqs.text, 'html.parser')
links= soup.find_all('a',href=re.compile("roster"))
s=[url + link.get("href") for link in links]
for i in s:
r = requests.get(i, allow_redirects=True, headers=headers)
if r.status_code < 400:
R.append(r.url)
Output
['https://ascscotties.com/sports/womens-basketball/roster',
'https://ascscotties.com/sports/womens-cross-country/roster',
'https://ascscotties.com/sports/womens-soccer/roster',
'https://ascscotties.com/sports/softball/roster',
'https://ascscotties.com/sports/womens-tennis/roster',
'https://ascscotties.com/sports/womens-volleyball/roster']
The code looks for roster links from url's and gives output, but like "https://auyellowjackets.com/" it fails as the url takes use to a splash screen. What can be done?
The site uses a cookie to indicate it has shown a splash screen before. So set it to get to the main page:
import re
import requests
from bs4 import BeautifulSoup
R = []
url = "https://auyellowjackets.com"
cookies = {"splash_2": "splash_2"} # <--- set cookie
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; "
"Intel Mac OS X 10.6; rv:16.0) Gecko/20100101 Firefox/16.0"
}
reqs = requests.get(url, headers=headers, cookies=cookies)
soup = BeautifulSoup(reqs.text, "html.parser")
links = soup.find_all("a", href=re.compile("roster"))
s = [url + link.get("href") for link in links]
for i in s:
r = requests.get(i, allow_redirects=True, headers=headers)
if r.status_code < 400:
R.append(r.url)
print(*R, sep="\n")
Prints:
https://auyellowjackets.com/sports/mens-basketball/roster
https://auyellowjackets.com/sports/mens-cross-country/roster
https://auyellowjackets.com/sports/football/roster
https://auyellowjackets.com/sports/mens-track-and-field/roster
https://auyellowjackets.com/sports/mwrest/roster
https://auyellowjackets.com/sports/womens-basketball/roster
https://auyellowjackets.com/sports/womens-cross-country/roster
https://auyellowjackets.com/sports/womens-soccer/roster
https://auyellowjackets.com/sports/softball/roster
https://auyellowjackets.com/sports/womens-track-and-field/roster
https://auyellowjackets.com/sports/volleyball/roster

I want to go to the all the pages of yelp webiste and extract data from

I want to go to all the pages of the yelp site but cann't
this is the code
# packages
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
import urllib
import os
import json
import datetime
import csv
# property scraper class
class Yelp(scrapy.Spider):
# scraper name
name = 'home business'
base_url = 'https://www.yelp.com/search?'
params = {
'find_desc': 'Home Cleaning',
'find_loc':'North Dallas, Dallas, TX',
#'start' : ''
}
page = 0
current_page = 1
# headers
headers = {
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"
}
#params['start'] = page
try:
os.remove('abx.csv')
except OSError:
pass
# custom settings
custom_settings = {
'CONCURRENT_REQUEST_PER_DOMAIN': 2,
'DOWNLOAD_DELAY': 1
}
# general crawler
def start_requests(self):
url = self.base_url + urllib.parse.urlencode(self.params)
# initial HTTP request
yield scrapy.Request(
url=url,
headers=self.headers,
callback=self.parse_listing
)
def parse_listing(self, response):
lists = response.css('h4[class="css-1l5lt1i"]')
for link in lists:
link = link.css('a::attr(href)').get()
link = 'https://www.yelp.com/' + link
#print('\n\nlink:',link,'\n\n')
yield response.follow(link, headers = self.headers, callback = self.parse_cards)
break
try:
#self.params['start'] = self.page
try:
total_pages = response.css('.text-align--center__09f24__1P1jK .css-e81eai::text').get()[5:7]
print(total_pages)
self.page +=10
self.current_page +=1
except Exception as e:
total_pages = 1
print('totl:',total_pages)
print('PAGE %s | %s ' % (self.current_page, total_pages))
if int(self.page/10) <= int(total_pages):
self.log('\n\n %s | %s\n\n ' %(self.page/10, total_pages))
next_page = response.url + '&start=' + str(self.page)
yield response.follow(url = next_page, headers = self.headers, callback = self.parse_listing)
except:
print('only single page',self.current_page)
def parse_cards(self,response):
print('\nok\n')
# main driver
if __name__ == '__main__':
# run scraper
process = CrawlerProcess()
process.crawl(Yelp)
process.start()
#Yelp.parse_cards(Yelp, '')
I applied try and except method also but cann't done the job.
The main problem is in the next page with the param '&start=' if i increment the start to 10 in every time then the url become every time like this
'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=10&start=20&start=30'
and so on i want to only the url start will increment to start=10 and after them start=20 and so on.
like this
'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=20'
'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=30'
and so on.
Just find the link to the next page and follow that
next_page = response.css("a.next-link::attr(href)").get()
if next_page:
yield response.follow(next_page, callback=self.parse)
This is pretty similar to what is done in the scrapy tutorial, have you followed that? Was there a reason you couldn't do it this way?
In the end your entire spider can become
from scrapy import Spider
class Yelp(Spider):
# scraper name
name = "home business"
start_urls = [
"https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX"
]
def parse(self, response):
for link in response.css("h4 > span > a"):
yield response.follow(link, callback=self.parse_cards)
next_page = response.css("a.next-link::attr(href)").get()
if next_page:
yield response.follow(next_page, callback=self.parse)
def parse_cards(self, response):
print("parse_cards", response.url)
I removed the start_requests stuff to keep it simple for this example (something you should probably try to do when asking questions)

How to change user agent when Tor ip changes in Scrapy

I use Tor and Privoxy with TorIpChanger to change ip after a random number of items_scraped. And it is working fine. I would like to change user-agent as well, when ip changes.
I am a bit confused about the way to go to achieve this. I have had a look at scrapy_useragents and similar solutions looking for inspiration, without a lot of success for now. This is what i'm trying to do, based on https://github.com/khpeek/scraper-compose/ and https://docs.scrapy.org/en/latest/topics/extensions.html
extensions.py
class TorRenewIdentity(object):
def __init__(self, crawler, item_count, user_agents):
self.crawler = crawler
self.item_count = self.randomize(item_count) # Randomize the item count to confound traffic analysis
self._item_count = item_count # Also remember the given item count for future randomizations
self.items_scraped = 0
self.user_agents = user_agents
# Connect the extension object to signals
self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
#staticmethod
def randomize(item_count, min_factor=0.5, max_factor=1.5):
'''Randomize the number of items scraped before changing identity. (A similar technique is applied to Scrapy's DOWNLOAD_DELAY setting).'''
randomized_item_count = random.randint(int(min_factor*item_count), int(max_factor*item_count))
logger.info("The crawler will scrape the following (randomized) number of items before changing identity (again): {}".format(randomized_item_count))
return randomized_item_count
#classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('TOR_RENEW_IDENTITY_ENABLED'):
raise NotConfigured
item_count = crawler.settings.getint('TOR_ITEMS_TO_SCRAPE_PER_IDENTITY', 10)
user_agents = crawler.settings['USER_AGENT']
return cls(crawler=crawler, item_count=item_count, user_agents=user_agents) # Instantiate the extension object
def item_scraped(self, item, spider):
'''When item_count items are scraped, pause the engine and change IP address.'''
self.items_scraped += 1
if self.items_scraped == self.item_count:
logger.info("Scraped {item_count} items. Pausing engine while changing identity...".format(item_count=self.item_count))
self.crawler.engine.pause()
ip_changer.get_new_ip() # Change IP address with toripchanger https://github.com/DusanMadar/TorIpChanger
self.items_scraped = 0 # Reset the counter
self.item_count = self.randomize(self._item_count) # Generate a new random number of items to scrape before changing identity again
# Get new user agent from list
if self.user_agents:
new_user_agent = random.choice(self.user_agents)
logger.info('Load {} user_agents from settings. New user agent is {}.'.format(
len(self.user_agents) if self.user_agents else 0, new_user_agent))
# Change user agent here ?
# For next self.item_count items
# headers.setdefault('User-Agent', new_user_agent)
#
self.crawler.engine.unpause()
settings.py
USER_AGENT = [
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:56.0) Gecko/20100101 Firefox/56.0'
]
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'scrapydevua.middlewares.ScrapydevuaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'scrapydevua.middlewares.ScrapydevuaDownloaderMiddleware': 543,
#}
EXTENSIONS = {
'scrapydevua.extensions.TorRenewIdentity': 1,
}

I have scrapy script, but I can not scrape data, don't knew why

I run the script, but I got none, but there are data on the url
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
class GetSpider(scrapy.Spider):
name = 'gets'
start_urls = ['https://www.retailmenot.com/coupons/insurance?u=ZTF65B5PJZEU3JDF326WY2SXOQ']
def parse(self, response):
s = Selector(response)
code = s.xpath("//button[contains(#class,'CopyCode')][1]/text()").get()
yield {'code':code}
I expect 52YR, but i got None
The easiest way to go about this is probably to load the json in the script as a python dictionary and navigate through it to get to the codes.
The below code should get you started:
import scrapy
import json
import logging
class GetSpider(scrapy.Spider):
name = 'gets'
start_urls = ['https://www.retailmenot.com/coupons/insurance?u=ZTF65B5PJZEU3JDF326WY2SXOQ']
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
}
custom_settings = {'ROBOTSTXT_OBEY': False}
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(url,
callback=self.parse,
headers=self.headers,
dont_filter=True)
def parse(self, response):
script = response.xpath(
'//script[contains(text(), "__NEXT_DATA__")]/text()'
).extract_first()
dict_start_index = script.index('{')
dict_end_index = script.index('};') + 1
data = json.loads(script[dict_start_index:dict_end_index])
coupon_data = data['props']['pageProps']['serverState']['apollo']['data']
for key, value in coupon_data.items():
try:
code = value['code']
except KeyError:
logging.debug("no code found")
else:
yield {'code': code}

How to download JIRA attachment files with Python

I want to download attachment files of an issue in JIRA Python.
I use jira python lib ,you can use pip install JIRA
# -- coding: UTF-8 --
from jira import JIRA
import requests
url = 'https://jira.1234.com'
jira = JIRA(server=url, basic_auth=('admin', 'password'))
attachment=jira.attachment(12345) #12345 is attachment_key
image = attachment.get()
with open("Image.png", 'wb') as f:
f.write(image)
JIRA exposes its REST services and through that and some python you can download any attachment.
It worked for me like this (you'll need to adjust the variables):
#!/usr/bin/python
# miguel ortiz
# Requests module: http://docs.python-requests.org/en/latest/
# Documentation: <url>
#----------------------------------------------------------------Modules
import sys
import csv, json
import requests
#----------------------------------------------------------------Variables
myTicket= sys.argv[1] # Your ticket: ABC-123
user = 'miguel' # JIRA user
pasw = 'password' # JIRA password
jiraURL = 'https://yourinstance.jira.com/rest/api/latest/issue/'
fileName = 'my_attached_file' # In this case we'll be looking for a specific file in the attachments
attachment_final_url="" # To validate if there are or not attachments
def main() :
print '\n\n [ You are checking ticket: ' + myTicket+ ' ]\n'
# Request Json from JIRA API
r = requests.get(jiraURL+myTicket, auth=(user, pasw),timeout=5)
# status of the request
rstatus = r.status_code
# If the status isn't 200 we leave
if not rstatus == 200 :
print 'Error accesing JIRA:' + str(rstatus)
exit()
else:
data = r.json()
if not data['fields']['attachment'] :
status_attachment = 'ERROR: Nothing attached, attach a file named: ' + fileName
attachment_final_url=""
else:
for i in data['fields']['attachment'] :
if i['filename'] == fileName :
attachment_final_url = i['content']
status_attachment_name = 'OK: The desired attachment exists: ' + fileName
attachment_name = False
attachment_amount = False
attachment_files = False
break
else :
attachment_files = False
status_attachment_name = + 'ERROR: None of the files has the desired name '
attachment_final_url=""
attachment_name = True
attachment_amount = True
continue
if attachment_final_url != "" :
r = requests.get(attachment_final_url, auth=(user, pasw), stream=True)
with open(fileName, "wb") as f:
f.write(r.content.decode('iso-8859-1').encode('utf8'))
f.close()
else:
print status_attachment
if __name__ == "__main__" :
main()
If you do not understand the code I've detailed it better in my blog.
EDIT: Be careful, in JIRA you can add many files with the same name.

Resources