extract email from craigslist post - web-scraping

Is there a way to find the email from a listing on Craigslist without using Selenium?
import requests,re
from bs4 import BeautifulSoup as bs
url='https://newyork.craigslist.org/wch/prk/d/hawthorne-10x15-drive-up-storage-unit/7122801839.html' #example url
headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
res=requests.get(url,headers=headers)
The email changes with each request made (I assume). I tried x = re.findall(r'(\w{32})', res.text) but it doesn't work.

Craigslist fetches the email address by sending a POST request to this special URL:
https://newyork.craigslist.org/contactinfo/nyc/prk/U_ID
The value of this U_ID is 7122801839 in this case (from the URL you provided).
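If you want to derive the ID automatically instead of copying it by hand, a minimal sketch (assuming the post ID is always the numeric segment right before ".html" in the listing URL) could look like this:
import re
listing_url = 'https://newyork.craigslist.org/wch/prk/d/hawthorne-10x15-drive-up-storage-unit/7122801839.html'
# Assumption: the post ID is the digits immediately before ".html"
match = re.search(r'/(\d+)\.html', listing_url)
u_id = match.group(1) if match else None
print(u_id)  # 7122801839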
You can replicate this request like this:
from bs4 import BeautifulSoup
import requests
import json

U_ID = "7122801839"
URL = f"https://newyork.craigslist.org/contactinfo/nyc/prk/{U_ID}"
COOKIE_VALUE = "cookie"  # Replace this with a valid cookie copied from your browser
HEADERS = {
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept': '*/*',
    'Accept-Language': 'en-us',
    'Accept-Encoding': 'gzip, deflate, br',
    'Host': 'newyork.craigslist.org',
    'Origin': 'https',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Safari/605.1.15',
    'Connection': 'keep-alive',
    'Referer': 'https',
    'Content-Length': '44816',
    'Cookie': COOKIE_VALUE,
    'X-Requested-With': 'XMLHttpRequest',
}
PAYLOAD = {
    'MIME Type': 'application/x-www-form-urlencoded; charset=UTF-8',
}

# Replicate the POST request the listing page makes to reveal the reply address
response = requests.request(
    method='POST',
    url=URL,
    headers=HEADERS,
    data=PAYLOAD
)

# The response is JSON; the HTML fragment with the contact links is under 'replyContent'
html = json.loads(response.text)['replyContent']
soup = BeautifulSoup(html, 'html.parser')

# The mailto: link holds the reply address; strip the scheme and the subject parameter
email = soup.find(class_='mailapp').get('href')
email = email.split('?subject')[0].replace('mailto:', '')
print(email)
Please note that this code won't work without a cookie, so you will need to copy the cookie from your browser.

Related

Trying to request information from "IDEALISTA webpage" in PYTHON with "requests" and I get <Response [403]> [closed]

It's an easy starter project, but the main part is accessing the information on the webpage, so I don't know if I'm doing something wrong.
The starting code (to check that it works on the Fotocasa webpage) is:
import requests
from bs4 import BeautifulSoup
url = 'https://www.fotocasa.es/es/comprar/vivienda/valencia-capital/aire-acondicionado-trastero-ascensor-no-amueblado/161485852/d'
# url = 'https://www.idealista.com/inmueble/97795476/'
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'es,es-ES;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cache-control': 'max-age=0',
    'dnt': '1',
    'sec-ch-ua': '"Chromium";v="106", "Microsoft Edge";v="106", "Not;A=Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
}
r = requests.get(url, headers=headers)
print(r)
req = requests.get(url, headers=headers).text
# Now that the content is ready, iterate
# through the content using BeautifulSoup:
soup = BeautifulSoup(req, "html.parser")
# get the information of a given tag
inm = soup.find(class_="re-DetailHeader-propertyTitle").text
print(inm)
You can try it and see that with the Fotocasa URL it works perfectly (gets <Response [200]>), but with the Idealista one it doesn't (gets <Response [403]>).
That code is:
import requests
from bs4 import BeautifulSoup
# url = 'https://www.fotocasa.es/es/comprar/vivienda/valencia-capital/aire-acondicionado-trastero-ascensor-no-amueblado/161485852/d'
url = 'https://www.idealista.com/inmueble/97795476/'
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'es,es-ES;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cache-control': 'max-age=0',
    'dnt': '1',
    'sec-ch-ua': '"Chromium";v="106", "Microsoft Edge";v="106", "Not;A=Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
}
r = requests.get(url, headers=headers)
print(r)
req = requests.get(url, headers=headers).text
# Now that the content is ready, iterate
# through the content using BeautifulSoup:
soup = BeautifulSoup(req, "html.parser")
# get the information of a given tag
inm = soup.find(class_="main-info__title-main").text
print(inm)
Your headers are probably fine, but I think you need to include cookies as well.
One way to replicate your browser's request almost exactly is to go to the Network tab in the browser's dev tools and copy the request for the page
(you might need to refresh for it to show up - it's the one whose Request URL matches whatever you entered in the address bar).
Then you can paste the copied cURL into something like curlconverter to convert it to Python code, and paste that into your script so you can continue with
soup = BeautifulSoup(response.content, "html.parser")
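The converted code typically ends up shaped roughly like the sketch below (the cookie name and value here are placeholders, not the real ones Idealista sets; the converter fills in your actual values):
import requests
from bs4 import BeautifulSoup

# Placeholder cookie/header values - curlconverter fills these in from your copied request
cookies = {
    'example_cookie': 'value_copied_from_your_browser',
}
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'accept-language': 'es,es-ES;q=0.9,en;q=0.8',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47',
}

response = requests.get('https://www.idealista.com/inmueble/97795476/', cookies=cookies, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")
print(soup.find(class_="main-info__title-main").text)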
However, you will have to update the cookies frequently, so it might be less hassle to use a library or API that can bypass these blocks. For example, if you sign up for ScrapingAnt and then paste your API token into the code below:
import requests
from bs4 import BeautifulSoup

scrapingant_api = 'https://api.scrapingant.com/v2/general'
scrapingant_key = 'YOUR_API_TOKEN'  # paste here
url_to_scrape = 'https://www.idealista.com/inmueble/97795476/'
url = f'{scrapingant_api}?url={url_to_scrape}&x-api-key={scrapingant_key}'

r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
inm = soup.find(class_="main-info__title-main").text
print(inm)  # prints "Ático en venta en plaza del Ayuntamiento, 6"
There is a limit to how many requests you can run on the free tier of ScrapingAnt, though, so I suggest also considering Selenium if you'll need to scrape an unlimited number of times. If you copy the function from this gist, you can simply call it like:
# def linkToSoup_selenium .... # paste function into your code
soup = linkToSoup_selenium('https://www.idealista.com/inmueble/97795476/')
inm = soup.find(class_="main-info__title-main").text
print(inm) # prints "Ático en venta en plaza del Ayuntamiento, 6"
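If you'd rather not pull in the gist, a minimal stand-in for that function could look like the sketch below (this is not the gist's actual code; it assumes Selenium 4 with a local Chrome install, and the real helper does more waiting and error handling):
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def linkToSoup_selenium(url, wait=5):
    # Load the page in headless Chrome and return it as a BeautifulSoup object
    opts = Options()
    opts.add_argument('--headless=new')
    driver = webdriver.Chrome(options=opts)
    try:
        driver.get(url)
        time.sleep(wait)  # crude wait for JavaScript-rendered content
        return BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        driver.quit()

soup = linkToSoup_selenium('https://www.idealista.com/inmueble/97795476/')
print(soup.find(class_="main-info__title-main").text)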
You should ALWAYS check the robots.txt file if you want to scrape a page. Read this: ( https://dan-suciu.medium.com/the-complete-manual-to-legal-ethical-web-scraping-in-2021-3eeae278b334 )
In the case of your second URL it seems that scraping is not allowed - it is blocked. Open https://www.idealista.com/robots.txt and look at the text; Google translates it as:
Misuse has been detected Access has been blocked
Having trouble accessing the site? Contact support
ID: fc1d890d-6ed6-8959-cd68-de965251f89b
IP: xx.xx.xx.xx
All the best,
The idealist team
Regards...
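As a side note, you can also check robots.txt rules programmatically with Python's standard library urllib.robotparser; a minimal sketch (most useful on sites that serve the file normally, which Idealista apparently does not) would be:
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url('https://www.idealista.com/robots.txt')
rp.read()

# True or False depending on what the site allows for this user agent
print(rp.can_fetch('*', 'https://www.idealista.com/inmueble/97795476/'))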

Access Token Meta Data via Solscan API with Python

I am trying to access the metadata of a Solana token via the Solscan API.
The following code works in principle, but the API doesn't provide the expected data.
import requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}
params = {
    'token': '24jvtWN7qCf5GQ5MaE7V2R4SUgtRxND1w7hyvYa2PXG6',
}
response = requests.get('https://api.solscan.io/token/meta', headers=headers, params=params)
print(response.content.decode())
It returns:
{"succcess":true,"data":{"holder":1}}
However, I expected a much fuller response, as described in the docs: https://public-api.solscan.io/docs/#/Token/get_token_meta
Any help? Thanks!
I tried this with another token and got the full response. It seems the example SPL token is simply lacking metadata to display.
import requests
from requests.structures import CaseInsensitiveDict
url = "https://public-api.solscan.io/token/meta?tokenAddress=4k3Dyjzvzp8eMZWUXbBCjEvwSkkk59S5iCNLY3QrkX6R"
headers = CaseInsensitiveDict()
headers["accept"] = "application/json"
resp = requests.get(url, headers=headers)
print(resp.status_code)
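To inspect the body rather than just the status code, you can also decode and pretty-print the JSON (the fields you get back depend entirely on the token you query):
import json
data = resp.json()
print(json.dumps(data, indent=2))  # pretty-print whatever metadata comes back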

Why I can't scrape this website even though the html content is visible?

This is the link to scrape.
I don't know why, but I can't scrape the data after the <ul> element which contains the entire data on this website.
I used requests_html and BeautifulSoup, but there were no successful results :(
This is the basic code I used:
url = "https://www.bcorporation.net/en-us/find-a-b-corp/search?refinement=countries%3DUnited%20Kingdom&refinement=countries%3DAustralia&refinement=countries%3DBelgium&refinement=countries%3DCanada&refinement=countries%3DChina&refinement=countries%3DDenmark&refinement=countries%3DFrance&refinement=countries%3DGermany&refinement=countries%3DIreland&refinement=countries%3DItaly&refinement=countries%3DNetherlands%20The&refinement=countries%3DMexico&refinement=countries%3DNew%20Zealand&refinement=countries%3DPortugal&refinement=countries%3DSpain&refinement=countries%3DSweden&refinement=countries%3DSwitzerland&refinement=countries%3DUnited%20States"
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246 '
}
session = HTMLSession()
r = session.get(url, headers=headers)
r.html.render(sleep=50, timeout=50)
content = r.html.find("ul")
title = r.html.find("div[data-testid = company_name]")
print(title)
Looking at the page, the data-testid is company-name, not company_name :)
title = r.html.find("div[data-testid=company-name]")
worked for me.

Losing information when using BeautifulSoup

I am following the guide 'Automate the Boring Stuff with Python',
practicing the project called 'Project: "I'm Feeling Lucky" Google Search',
but the CSS selector returns nothing.
import requests, sys, webbrowser, bs4, pyperclip

if len(sys.argv) > 1:
    address = ' '.join(sys.argv[1:])
else:
    address = pyperclip.paste()

res = requests.get('http://google.com/search?q=' + str(address))
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, "html.parser")
linkElems = soup.select('.r a')
for i in range(5):
    webbrowser.open('http://google.com' + linkElems[i].get('href'))
I already tested the same code in the IDLE shell. It seems that
linkElems = soup.select('.r')
returns nothing, and after I checked the value returned by BeautifulSoup in
soup = bs4.BeautifulSoup(res.text, "html.parser")
I found that every class='r' and class='rc' is gone for no reason, even though they were there in the raw HTML file.
Please tell me why this happens and how to avoid such problems.
To get the version of the HTML in which class r is defined, it's necessary to set a User-Agent in the headers:
import requests
from bs4 import BeautifulSoup

address = 'linux'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0'}

res = requests.get('http://google.com/search?q=' + str(address), headers=headers)
res.raise_for_status()
soup = BeautifulSoup(res.text, "html.parser")

linkElems = soup.select('.r a')
for a in linkElems:
    if a.text.strip() == '':
        continue
    print(a.text)
Prints:
Linux.orghttps://www.linux.org/
Puhverdatud
Tõlgi see leht
Linux – Vikipeediahttps://et.wikipedia.org/wiki/Linux
Puhverdatud
Sarnased
Linux - Wikipediahttps://en.wikipedia.org/wiki/Linux
...and so on.
The reason Google blocks your request is that the default requests user-agent is python-requests. Google checks the user-agent, blocks the request, and returns completely different HTML with different elements and selectors. Sometimes you can still receive different HTML, with different selectors, even when you do send a user-agent.
Learn more about the user-agent and HTTP request headers.
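A quick way to see what user-agent requests sends by default is to hit an echo service such as httpbin.org, for example:
import requests

# httpbin echoes back the headers it received
r = requests.get('https://httpbin.org/headers')
print(r.json()['headers']['User-Agent'])  # something like "python-requests/2.x.x"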
Pass user-agent into request headers:
headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
requests.get('YOUR_URL', headers=headers)
Try using the lxml parser instead; it's faster. Code and full example in the online IDE:
from bs4 import BeautifulSoup
import requests
headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
params = {
    "q": "My query goes here"
}

html = requests.get('https://www.google.com/search', headers=headers, params=params)
soup = BeautifulSoup(html.text, 'lxml')

for result in soup.select('.tF2Cxc'):
    link = result.select_one('.yuRUbf a')['href']
    print(link)
-----
'''
https://dev.mysql.com/doc/refman/8.0/en/entering-queries.html
https://www.benlcollins.com/spreadsheets/google-sheets-query-sql/
https://www.exoscale.com/syslog/explaining-mysql-queries/
https://blog.hubspot.com/marketing/sql-tutorial-introduction
https://mode.com/sql-tutorial/sql-sub-queries/
https://www.mssqltips.com/sqlservertip/1255/getting-io-and-time-statistics-for-sql-server-queries/
https://stackoverflow.com/questions/2698401/how-to-store-mysql-query-results-in-another-table
https://www.khanacademy.org/computing/computer-programming/sql/relational-queries-in-sql/a/more-efficient-sql-with-query-planning-and-optimization
http://cidrdb.org/cidr2011/Papers/CIDR11_Paper7.pdf
https://www.sommarskog.se/query-plan-mysteries.html
'''
Alternatively, you can do the same thing using the Google Organic Results API from SerpApi. It's a paid API with a free plan.
The difference in your case is that you only need to extract the data you want from a JSON string, rather than figuring out how to extract data, maintain the scraper, or bypass blocks from Google.
Code to integrate:
import os
from serpapi import GoogleSearch  # from the google-search-results package

params = {
    "engine": "google",
    "q": "My query goes here",
    "hl": "en",
    "api_key": os.getenv("API_KEY"),
}

search = GoogleSearch(params)
results = search.get_dict()

for result in results["organic_results"]:
    print(result['link'])
-------
'''
https://dev.mysql.com/doc/refman/8.0/en/entering-queries.html
https://www.benlcollins.com/spreadsheets/google-sheets-query-sql/
https://www.exoscale.com/syslog/explaining-mysql-queries/
https://blog.hubspot.com/marketing/sql-tutorial-introduction
https://mode.com/sql-tutorial/sql-sub-queries/
https://www.mssqltips.com/sqlservertip/1255/getting-io-and-time-statistics-for-sql-server-queries/
https://stackoverflow.com/questions/2698401/how-to-store-mysql-query-results-in-another-table
https://www.khanacademy.org/computing/computer-programming/sql/relational-queries-in-sql/a/more-efficient-sql-with-query-planning-and-optimization
http://cidrdb.org/cidr2011/Papers/CIDR11_Paper7.pdf
https://www.sommarskog.se/query-plan-mysteries.html
'''
Disclaimer, I work for SerpApi.

How to login with python requests: a tricky case

I am trying to access a website provided by my school, in order to automate checking whether something has been published (such as marks, news, etc.).
I analyzed the page's HTML, searched for all the input tags (even hidden ones), and put them in a dictionary. I wrote this code:
import requests, lxml.html
from bs4 import BeautifulSoup as bs

login_url = "https://www.portaleargo.it/argoweb/famiglia/common/login_form2.jsp"
whenloggedin_url = "https://www.portaleargo.it/argoweb/famiglia/index.jsf#"

def try_conn(sch_code, user, password):
    with requests.Session() as s:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
        site = s.get(login_url, headers=headers)
        bs_content = bs(site.content, "html.parser")
        token = bs_content.find("input", {"id": "cod_utente"})["value"]
        login_data = {
            "codice_scuola": sch_code,
            "utente": user,
            "j_password": password,
            "cod_utente": token,
        }
        login = s.post(login_url, data=login_data)
        # Proof that it logged in correctly
        if login.url == whenloggedin_url:
            return True
        return False
The function returns False. I also tried to print login.status_code (and it returns 200). I really cannot say why this is not working. What should I do?
I think this should work.
import requests

post_url = "https://www.portaleargo.it/argoweb/famiglia/common/j_security_check"
login_url = "https://www.portaleargo.it/argoweb/famiglia/common/login_form2.jsp"
whenloggedin_url = "https://www.portaleargo.it/argoweb/famiglia/index.jsf#"

def try_conn(sch_code, user, password):
    with requests.Session() as s:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
        site = s.get(login_url, headers=headers)
        login_data = {
            "udente": user,
            "j_password": password,
            "j_username": user + "#" + sch_code,
            "submit": "Entra",
        }
        login = s.post(post_url, data=login_data)
        # Proof that it logged in correctly
        if login.url == whenloggedin_url:
            return True
        return False
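A call would then look something like this (the school code and credentials below are just placeholders):
# Placeholder credentials - replace with your own school code, username and password
if try_conn("SCHOOL_CODE", "username", "password"):
    print("Logged in")
else:
    print("Login failed")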
