I want to be able to convert Japanese kanji to their readings, e.g. 日本語 to nihongo or にほんご.
Google translate website allows you to do that - https://translate.google.com/?sl=ja&tl=en&text=%E6%97%A5%E6%9C%AC%E8%AA%9E&op=translate.
You can see that it shows Nihongo for 日本語 input.
But I can't find any API in Google Cloud Translate that allows to do the same from my app.
Maybe Google doesn't offer that as an API and some other service does, but I can't find such a service either.
Google offers the Translation API, which fits your requirement.
Using this documentation to replicate your scenario in python:
20220314.py:
import argparse
# [START translate_detect_language]
def detect_language(text):
    """Detects the text's language."""
    # The docstring above doubles as the CLI help text for the
    # "detect-language" sub-command, so its wording is kept as is.
    from google.cloud import translate_v2 as translate

    client = translate.Client()

    # A sequence of strings may be passed instead of a single string; the
    # client then returns a sequence with one result per input.
    detection = client.detect_language(text)

    # Echo the input, then the two fields read from the detection result.
    print("Text: {}".format(text))
    print("Confidence: {}".format(detection["confidence"]))
    print("Language: {}".format(detection["language"]))
# [END translate_detect_language]
# [START translate_list_codes]
def list_languages():
    """Lists all available languages."""
    from google.cloud import translate_v2 as translate

    # Each entry returned by get_languages() is unpacked into the format
    # call, which reads its "name" and "language" keys.
    for entry in translate.Client().get_languages():
        print(u"{name} ({language})".format(**entry))
# [END translate_list_codes]
# [START translate_list_language_names]
def list_languages_with_target(target):
    """Lists all available languages and localizes them to the target language.
    Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages
    """
    from google.cloud import translate_v2 as translate

    client = translate.Client()

    # Passing target_language asks the client for the language list with
    # ``target`` as the localization language.
    localized = client.get_languages(target_language=target)
    for entry in localized:
        print(u"{name} ({language})".format(**entry))
# [END translate_list_language_names]
# [START translate_text_with_model]
def translate_text_with_model(target, text, model="nmt"):
    """Translate ``text`` into ``target`` with an explicit model and print the result.

    Make sure your project is allowlisted.
    ``target`` must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages

    Args:
        target: Language code to translate into.
        text: Text to translate; ``bytes`` input is decoded as UTF-8 first.
        model: Model name forwarded to the client's ``translate`` call.
    """
    from google.cloud import translate_v2 as translate

    client = translate.Client()

    if isinstance(text, bytes):
        text = text.decode("utf-8")

    # A sequence of strings may be passed instead of a single string; the
    # client then returns a sequence with one result per input.
    outcome = client.translate(text, target_language=target, model=model)

    print(u"Text: {}".format(outcome["input"]))
    print(u"Translation: {}".format(outcome["translatedText"]))
    print(u"Detected source language: {}".format(outcome["detectedSourceLanguage"]))
# [END translate_text_with_model]
# [START translate_translate_text]
def translate_text(target, text):
    """Translates text into the target language.
    Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages
    """
    # The docstring above doubles as the CLI help text for the
    # "translate-text" sub-command, so its wording is kept as is.
    from google.cloud import translate_v2 as translate

    translate_client = translate.Client()

    # Check the builtin ``bytes`` directly, exactly as
    # translate_text_with_model does; this removes the extra ``six`` import
    # that was only used for this one isinstance test.
    if isinstance(text, bytes):
        text = text.decode("utf-8")

    # Text can also be a sequence of strings, in which case this method
    # will return a sequence of results for each text.
    result = translate_client.translate(text, target_language=target)

    print(u"Text: {}".format(result["input"]))
    print(u"Translation: {}".format(result["translatedText"]))
    print(u"Detected source language: {}".format(result["detectedSourceLanguage"]))
# [END translate_translate_text]
if __name__ == "__main__":
    # Command-line front end: one sub-command per sample function, with the
    # function's docstring reused as that sub-command's help text.
    cli = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    commands = cli.add_subparsers(dest="command")

    detect_cmd = commands.add_parser("detect-language", help=detect_language.__doc__)
    detect_cmd.add_argument("text")

    commands.add_parser("list-languages", help=list_languages.__doc__)

    list_target_cmd = commands.add_parser(
        "list-languages-with-target", help=list_languages_with_target.__doc__
    )
    list_target_cmd.add_argument("target")

    translate_cmd = commands.add_parser("translate-text", help=translate_text.__doc__)
    translate_cmd.add_argument("target")
    translate_cmd.add_argument("text")

    args = cli.parse_args()

    # Map each sub-command name to the call it triggers. When no sub-command
    # was given, args.command matches no entry and nothing is run.
    handlers = {
        "detect-language": lambda: detect_language(args.text),
        "list-languages": lambda: list_languages(),
        "list-languages-with-target": lambda: list_languages_with_target(args.target),
        "translate-text": lambda: translate_text(args.target, args.text),
    }
    handler = handlers.get(args.command)
    if handler is not None:
        handler()
Output:
(venv) a#cloudshell:~/python_cases$ python 20220314.py translate-text en 日本語
Text: 日本語
Translation: Japanese
Detected source language: ja
The Detected source language field shows the ISO language code (ja for Japanese) rather than nihongo. To list the supported languages and their ISO language codes, you can also use this Python script:
(venv) a#cloudshell:~/python_cases$ python 20220314.py list-languages
Afrikaans (af)
Albanian (sq)
Amharic (am)
Arabic (ar)
Armenian (hy)
Azerbaijani (az)
Basque (eu)
Belarusian (be)
Bengali (bn)
Bosnian (bs)
Bulgarian (bg)
Catalan (ca)
Cebuano (ceb)
Chichewa (ny)
Chinese (Simplified) (zh-CN)
Chinese (Traditional) (zh-TW)
Corsican (co)
Croatian (hr)
Czech (cs)
Danish (da)
Dutch (nl)
English (en)
Esperanto (eo)
Estonian (et)
Filipino (tl)
Finnish (fi)
French (fr)
Frisian (fy)
Galician (gl)
Georgian (ka)
German (de)
Greek (el)
Gujarati (gu)
Haitian Creole (ht)
Hausa (ha)
Hawaiian (haw)
Hebrew (iw)
Hindi (hi)
Hmong (hmn)
Hungarian (hu)
Icelandic (is)
Igbo (ig)
Indonesian (id)
Irish (ga)
Italian (it)
Japanese (ja)
Javanese (jw)
Kannada (kn)
Kazakh (kk)
Khmer (km)
Kinyarwanda (rw)
Korean (ko)
Kurdish (Kurmanji) (ku)
Kyrgyz (ky)
Lao (lo)
Latin (la)
Latvian (lv)
Lithuanian (lt)
Luxembourgish (lb)
Macedonian (mk)
Malagasy (mg)
Malay (ms)
Malayalam (ml)
Maltese (mt)
Maori (mi)
Marathi (mr)
Mongolian (mn)
Myanmar (Burmese) (my)
Nepali (ne)
Norwegian (no)
Odia (Oriya) (or)
Pashto (ps)
Persian (fa)
Polish (pl)
Portuguese (pt)
Punjabi (pa)
Romanian (ro)
Russian (ru)
Samoan (sm)
Scots Gaelic (gd)
Serbian (sr)
Sesotho (st)
Shona (sn)
Sindhi (sd)
Sinhala (si)
Slovak (sk)
Slovenian (sl)
Somali (so)
Spanish (es)
Sundanese (su)
Swahili (sw)
Swedish (sv)
Tajik (tg)
Tamil (ta)
Tatar (tt)
Telugu (te)
Thai (th)
Turkish (tr)
Turkmen (tk)
Ukrainian (uk)
Urdu (ur)
Uyghur (ug)
Uzbek (uz)
Vietnamese (vi)
Welsh (cy)
Xhosa (xh)
Yiddish (yi)
Yoruba (yo)
Zulu (zu)
Hebrew (he)
Chinese (Simplified) (zh)
Related
I'm trying to retrieve the links of a Google Scholar user's work from their profile but am having trouble accessing the HTML that is hidden behind the "show more" button. I would like to be able to capture all the links from a user but currently can only get the first 20. I'm using the following script to scrape, for reference.
from bs4 import BeautifulSoup
import requests
# Fetch one Google Scholar author profile page and collect the href of the
# first link found in every row of the page's second table.
author_url = 'https://scholar.google.com/citations?hl=en&user=mG4imMEAAAAJ'
html_content = requests.get(author_url)
soup = BeautifulSoup(html_content.text, 'lxml')

# The BeautifulSoup method is find_all (it was misspelled "final_all"), and
# the 'table' string literal was missing its closing quote.
tables = soup.find_all('table')
table = tables[1]
rows = table.find_all('tr')

links = []
for row in rows:
    t = row.find('a')
    # Rows without any anchor are skipped.
    if t is not None:
        links.append(t.get('href'))
You need to use cstart URL parameter which stands for page number, 0 is the first page, 10 is the second.. This parameter allows to skip the need to click "show more button" and does the same thing.
This parameter needs to be used in while loop in order to paginate through all articles.
To exit the loop, one way is to check for a certain CSS selector such as .gsc_a_e, which is assigned to the text shown when no results are present:
The great thing about such approach is that it paginates dynamically, instead of for i in range() which is hard coded and will be broken if certain authors have 20 articles and another has 2550 articles.
On the screenshot above I'm using the SelectorGadget Chrome extension that lets you pick CSS selectors by clicking on certain elements in the browser. It works great if the website is not heavily JS driven.
Keep in mind that at some point you also need to use CAPTCHA solver or proxies. This is only when you need to extract a lot of articles from multiple authors.
Code with the option to save to CSV using pandas and a full example in the online IDE:
import pandas as pd
from bs4 import BeautifulSoup
import requests, lxml, json
def bs4_scrape_articles():
    """Collect every article of one Google Scholar author and print them as JSON.

    Requests the author's citations page 100 articles at a time, advancing the
    ``cstart`` query parameter after each page, and gathers title, link,
    authors and publication for each article row. Paging stops when the
    ``.gsc_a_e`` element is present or when a page contains no article rows.
    """
    params = {
        "user": "mG4imMEAAAAJ",  # user-id
        "hl": "en",              # language
        "gl": "us",              # country to search from
        "cstart": 0,             # articles page. 0 is the first page
        "pagesize": "100"        # articles per page
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
    }
    all_articles = []

    while True:
        html = requests.post("https://scholar.google.com/citations", params=params, headers=headers, timeout=30)
        soup = BeautifulSoup(html.text, "lxml")

        rows = soup.select("#gsc_a_b .gsc_a_t")
        for article in rows:
            # The title anchor supplies both the title text and the link.
            title_anchor = article.select_one(".gsc_a_at")
            all_articles.append({
                "title": title_anchor.text,
                "link": f'https://scholar.google.com{title_anchor["href"]}',
                "authors": article.select_one(".gsc_a_at+ .gs_gray").text,
                "publication": article.select_one(".gs_gray+ .gs_gray").text
            })

        # this selector is checking for the .class that contains: "There are no articles in this profile."
        # example link: https://scholar.google.com/citations?hl=en&user=mG4imMEAAAAJ&cstart=600
        # A page with no article rows at all also ends the loop; otherwise a
        # response carrying neither rows nor the marker would make the loop
        # request further pages forever.
        if soup.select_one(".gsc_a_e") or not rows:
            break
        params["cstart"] += 100  # paginate to the next page

    print(json.dumps(all_articles, indent=2, ensure_ascii=False))
    # pd.DataFrame(data=all_articles).to_csv(f"google_scholar_{params['user']}_articles.csv", encoding="utf-8", index=False)
bs4_scrape_articles()
Outputs (shows only last results as output is 400+ articles):
[
{
"title": "Exponential family sparse coding with application to self-taught learning with text documents",
"link": "https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:LkGwnXOMwfcC",
"authors": "H Lee, R Raina, A Teichman, AY Ng",
"publication": ""
},
{
"title": "Visual and Range Data",
"link": "https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:eQOLeE2rZwMC",
"authors": "S Gould, P Baumstarck, M Quigley, AY Ng, D Koller",
"publication": ""
}
]
If you don't want to deal with bypassing blocks from Google or maintaining your script, have a look at the Google Scholar Author Articles API.
There's also a scholarly package that can also extract author articles.
Code that shows how to extract all author articles with Google Scholar Author Articles API:
from serpapi import GoogleScholarSearch
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
import os
def serpapi_scrape_articles():
    """Collect every article of one author through the search client and print them as JSON.

    Runs the ``google_scholar_author`` engine for a fixed author id, follows
    the ``serpapi_pagination`` "next" link until none is returned, and prints
    the collected title, link, authors, publication and citation id of each
    article. The API key is read from the ``API_KEY`` environment variable.
    """
    # json is used for the final dump but is not among this snippet's own
    # imports, so it is imported here to keep the function self-contained.
    import json

    params = {
        # https://docs.python.org/3/library/os.html
        "api_key": os.getenv("API_KEY"),
        "engine": "google_scholar_author",
        "hl": "en",
        "author_id": "mG4imMEAAAAJ",
        "start": "0",
        "num": "100"
    }
    search = GoogleScholarSearch(params)
    all_articles = []

    while True:
        results = search.get_dict()

        # The former enumerate() index was never used, so iterate directly.
        for article in results["articles"]:
            all_articles.append({
                "title": article["title"],
                "link": article["link"],
                "authors": article["authors"],
                # "publication" is read with .get(), so it may be None.
                "publication": article.get("publication"),
                "citation_id": article["citation_id"]
            })

        pagination = results.get("serpapi_pagination", {})
        if "next" not in pagination:
            break
        # split URL in parts as a dict() and update "search" variable to a new page
        search.params_dict.update(dict(parse_qsl(urlsplit(pagination["next"]).query)))

    print(json.dumps(all_articles, indent=2, ensure_ascii=False))
    # pd.DataFrame(data=all_articles).to_csv(f"serpapi_google_scholar_{params['author_id']}_articles.csv", encoding="utf-8", index=False)
serpapi_scrape_articles()
Here is one way of obtaining that data:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
from tqdm import tqdm ## if Jupyter notebook: from tqdm.notebook import tqdm
# Download five pages (100 entries each) of one Google Scholar author profile
# and print a DataFrame of paper titles and links.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

headers = {
    'accept-language': 'en-US,en;q=0.9',
    'x-requested-with': 'XHR',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}
s = requests.Session()
s.headers.update(headers)
payload = {'json': '1'}

# Collect one frame per page and concatenate once after the loop, instead of
# re-concatenating the growing frame (seeded with an empty one) every pass.
frames = []
for x in tqdm(range(0, 500, 100)):
    url = f'https://scholar.google.com/citations?hl=en&user=mG4imMEAAAAJ&cstart={x}&pagesize=100'
    r = s.post(url, data=payload)
    # The HTML fragment is read from the 'B' key of the JSON response.
    soup = bs(r.json()['B'], 'html.parser')
    # Keep anchors that are real links with text longer than 7 characters.
    # The comprehension variable is named ``a`` so it no longer shadows the
    # page offset ``x``.
    works = [
        (a.get_text(), 'https://scholar.google.com' + a.get('href'))
        for a in soup.select('a')
        if 'javascript:void(0)' not in a.get('href') and len(a.get_text()) > 7
    ]
    frames.append(pd.DataFrame(works, columns=['Paper', 'Link']))

big_df = pd.concat(frames, axis=0, ignore_index=True)
print(big_df)
Result in terminal:
100%
5/5 [00:03<00:00, 1.76it/s]
Paper Link
0 Latent dirichlet allocation https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&pagesize=100&citation_for_view=mG4imMEAAAAJ:IUKN3-7HHlwC
1 On spectral clustering: Analysis and an algorithm https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&pagesize=100&citation_for_view=mG4imMEAAAAJ:2KloaMYe4IUC
2 ROS: an open-source Robot Operating System https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&pagesize=100&citation_for_view=mG4imMEAAAAJ:u-x6o8ySG0sC
3 Rectifier nonlinearities improve neural network acoustic models https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&pagesize=100&citation_for_view=mG4imMEAAAAJ:gsN89kCJA0AC
4 Recursive deep models for semantic compositionality over a sentiment treebank https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&pagesize=100&citation_for_view=mG4imMEAAAAJ:_axFR9aDTf0C
... ... ...
473 A Sparse Sampling Algorithm for Near-Optimal Planning in Large Markov Decision Processes https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:hMod-77fHWUC
474 On Discrim inative vs. Generative https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:qxL8FJ1GzNcC
475 Game Theory with Restricted Strategies https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:8k81kl-MbHgC
476 Exponential family sparse coding with application to self-taught learning with text documents https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:LkGwnXOMwfcC
477 Visual and Range Data https://scholar.google.com/citations?view_op=view_citation&hl=en&user=mG4imMEAAAAJ&cstart=400&pagesize=100&citation_for_view=mG4imMEAAAAJ:eQOLeE2rZwMC
478 rows × 2 columns
See pandas documentation at https://pandas.pydata.org/docs/
Also Requests docs: https://requests.readthedocs.io/en/latest/
For BeautifulSoup, go to https://beautiful-soup-4.readthedocs.io/en/latest/
And for TQDM visit https://pypi.org/project/tqdm/
I'm trying to scrape this news website "https://inshorts.com/en/read/national" and i'm just getting the results for just the displayed articles, i need all the articles on the website which contain the word (eg."COVID-19"), and don't have to use the "load more" button.
Here's my code which gives the current articles:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Scrape headline, body, time and category of the articles shown on each
# listed inshorts page and save them to a CSV file named by the user.

# NOTE(review): this first request's soup is overwritten inside the loop
# below before being read; it is kept in case the extra request is intended.
dummy_url = "https://inshorts.com/en/read/badminton"
data_dummy = requests.get(dummy_url)
soup = BeautifulSoup(data_dummy.content, 'html.parser')

urls = ["https://inshorts.com/en/read/national"]
news_data_content, news_data_title, news_data_category, news_data_time = [], [], [], []

for url in urls:
    # The last path segment of the URL is used as the category label.
    category = url.split('/')[-1]
    data = requests.get(url)
    soup = BeautifulSoup(data.content, 'html.parser')
    news_title = []
    news_content = []
    news_category = []
    news_time = []
    for headline, article, time in zip(soup.find_all('div', class_=["news-card-title news-right-box"]),
                                       soup.find_all('div', class_=["news-card-content news-right-box"]),
                                       soup.find_all('div', class_=["news-card-author-time news-card-author-time-in-title"])):
        news_title.append(headline.find('span', attrs={'itemprop': "headline"}).string)
        news_content.append(article.find('div', attrs={'itemprop': "articleBody"}).string)
        # The keyword was misspelled ``clas``, which filters on an attribute
        # literally named "clas"; ``class_`` filters on the CSS class. The
        # tag's text is stored, like the other columns, and None is kept
        # when no date span is found.
        date_tag = time.find('span', class_=["date"])
        news_time.append(date_tag.string if date_tag is not None else None)
        news_category.append(category)
    news_data_title.extend(news_title)
    news_data_content.extend(news_content)
    news_data_category.extend(news_category)
    news_data_time.extend(news_time)

# One single-column frame per field, joined side by side.
df1 = pd.DataFrame(news_data_title, columns=["Title"])
df2 = pd.DataFrame(news_data_content, columns=["Content"])
df3 = pd.DataFrame(news_data_category, columns=["Category"])
df4 = pd.DataFrame(news_data_time, columns=["time"])
df = pd.concat([df1, df2, df3, df4], axis=1)


def name():
    """Prompt for and return the output file name (without extension)."""
    a = input("File Name: ")
    return a


b = name()
df.to_csv(b + ".csv")
You can use this example how to simulate the clicking on Load More button:
import re
import requests
from bs4 import BeautifulSoup
# Print the headlines of the first page, then keep requesting further batches
# from the "more news" endpoint, for a fixed number of pages.
url = "https://inshorts.com/en/read/national"
api_url = "https://inshorts.com/en/ajax/more_news"
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0"
}

# First page: a normal GET. The offset for the next batch is read from the
# min_news_id assignment embedded in the page source.
first_page = requests.get(url, headers=headers)
html_doc = first_page.text
id_match = re.search(r'min_news_id = "([^"]+)"', html_doc)
min_news_id = id_match.group(1)

pages = 10  # number of pages to process
while pages > 0:
    soup = BeautifulSoup(html_doc, "html.parser")

    # Article processing would go here; this example only prints headlines.
    for headline in soup.select('[itemprop="headline"]'):
        print(headline.text)

    # Next batch: POST the current offset, then take both the new HTML and
    # the new offset from the JSON reply.
    reply = requests.post(api_url, data={"news_offset": min_news_id})
    data = reply.json()
    html_doc = data["html"]
    min_news_id = data["min_news_id"]

    pages -= 1
Prints news headlines of first 10 pages:
...
Moeen has done some wonderful things in Test cricket: Root
There should be an evolution in player-media relationship: Federer
Swiggy in talks to raise over $500 mn at $10 bn valuation: Reports
Tesla investors urged to reject Murdoch, Kimbal Musk's re-election
Doctor dies on Pune-Mumbai Expressway when rolls of paper fall on his car
2 mothers name newborn girls after Cyclone Gulab in Odisha
100 US citizens, permanent residents waiting to leave Afghanistan
Iran's nuclear programme has crossed all red lines: Israeli PM
I am trying to get the underlying data from the interactive map on this website:https://www.sabrahealth.com/properties
I tried using the Inspect feature on Google Chrome to find the XHR file that would hold the locations of all the points on the map but nothing appeared. Is there another way to extract the location data from this map?
Well, the location data is available to download on their site here. But let's assume you are wanting the actual latitude, longitude values to do some analysis.
The first thing I would do is exactly what you did (look for the XHR). If I can't find anything there, the second thing I always do is search the html for the <script> tags. sometimes the data is "hiding" in there. It takes a little bit more detective work. It doesn't always yield results, but it does in this case.
If you look within the <script> tags, you'll find the relevant json format. Then you can just work with that. It's just a matter of finding it then manipulating the string to get the valid json format, then use json.loads() to feed that in.
import requests
import bs4
import json
# Extract the map markers embedded in a <script> tag of the properties page
# and print type, sub-category, coordinates and location for each one.
url = 'https://www.sabrahealth.com/properties'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
response = requests.get(url, headers=headers)
soup = bs4.BeautifulSoup(response.text, 'html.parser')
scripts = soup.find_all('script')

# Start from None so a page without the settings script gives a clear error
# below instead of a NameError on an unassigned variable.
jsonObj = None
for script in scripts:
    if 'jQuery.extend(Drupal.settings,' in script.text:
        # Keep what follows the call's opening, cut at the last ");", and
        # parse the remainder as JSON.
        jsonStr = script.text.split('jQuery.extend(Drupal.settings,')[1]
        jsonStr = jsonStr.rsplit(');', 1)[0]
        jsonObj = json.loads(jsonStr)

if jsonObj is None:
    raise RuntimeError("No <script> containing 'jQuery.extend(Drupal.settings,' was found")

for each in jsonObj['gmap']['auto1map']['markers']:
    name = each['markername']
    lat = each['latitude']
    lon = each['longitude']
    # Each marker's 'text' is an HTML fragment. It is parsed into its own
    # variable so the page-level ``soup`` is not overwritten.
    popup = bs4.BeautifulSoup(each['text'], 'html.parser')
    prop_type = popup.find('i', {'class': 'property-type'}).text.strip()
    # Look the sub-category span up once; it is also the anchor for the
    # location paragraph that follows it.
    subcat_tag = popup.find('span', {'class': 'subcat'})
    sub_cat = subcat_tag.text.strip()
    location = subcat_tag.find_next('p').text.split('\n')[0]
    print('Type: %s\nSubCat: %s\nLat: %s\nLon: %s\nLocation: %s\n' % (prop_type, sub_cat, lat, lon, location))
Output:
Type: Senior Housing - Leased
SubCat: Assisted Living
Lat: 38.3309
Lon: -85.862521
Location: Floyds Knobs, Indiana
Type: Skilled Nursing/Transitional Care
SubCat: SNF
Lat: 29.719507
Lon: -99.06649
Location: Bandera, Texas
Type: Skilled Nursing/Transitional Care
SubCat: SNF
Lat: 37.189079
Lon: -77.376015
Location: Petersburg, Virginia
Type: Skilled Nursing/Transitional Care
SubCat: SNF
Lat: 37.759998
Lon: -122.254616
Location: Alameda, California
...
I'm trying to use the mail function below with Python 3, but it throws the error NameError: name 'file' is not defined, although it works perfectly with Python 2.
I learned that file() is not supported in Python 3; what is the substitute for file()?
#!/usr/bin/env python3
from subprocess import Popen, PIPE
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import os
############ File comparison & sendmail part starts here ########
def ps_Mail():
    """Mail the contents of /tmp/ps_msg through /usr/sbin/sendmail.

    The message is only built and sent when the file exists and is non-empty.
    This is the version from the question, left as posted.
    """
    filename = "/tmp/ps_msg"
    # file() is the call reported to fail under Python 3 with
    # "NameError: name 'file' is not defined".
    f = file(filename)
    if os.path.exists(filename) and os.path.getsize(filename) > 0:
        # -t: take recipients from the message headers set below.
        # NOTE(review): flag meanings are standard sendmail usage - confirm.
        mailp = Popen(["/usr/sbin/sendmail", "-t", "-oi"], stdin=PIPE)
        msg = MIMEMultipart('alternative')
        msg['To'] = "sam#seemac.com"
        msg['Subject'] = "Uhh!! Unsafe process seen"
        msg['From'] = "psCheck#seemac.com"
        # NOTE(review): .read() is called on the path string ``filename``,
        # not on the file object ``f`` opened above.
        msg1 = MIMEText(filename.read(), 'text')
        msg.attach(msg1)
        mailp.communicate(msg.as_string())
ps_Mail()
I have edited your code and this should work, please try this...
There are two key things to change universal_newlines=True and use open() instead of file().
#!/usr/bin/env python3
from subprocess import Popen, PIPE
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import os
############ File comparison & sendmail part starts here ########
def ps_Mail(filename="/tmp/ps_msg"):
    """Mail the contents of ``filename`` through the local sendmail binary.

    Nothing is sent, and ``None`` is returned, when the file does not exist
    or is empty.

    Args:
        filename: Path of the text file whose contents become the message
            body. Defaults to "/tmp/ps_msg".
    """
    # Test for the file before opening it, so a missing file is skipped
    # rather than raising from open().
    if not (os.path.exists(filename) and os.path.getsize(filename) > 0):
        return

    # Read from the file object (the path string has no .read()) and let the
    # with-block close the file.
    with open(filename) as f:
        body = f.read()

    # universal_newlines=True puts the pipe in text mode, so communicate()
    # accepts the str produced by msg.as_string().
    mailp = Popen(["/usr/sbin/sendmail", "-t", "-oi"], stdin=PIPE, universal_newlines=True)
    msg = MIMEMultipart('alternative')
    msg['To'] = "sam#seemac.com"
    msg['Subject'] = "Uhh!! Unsafe process seen"
    msg['From'] = "psCheck#seemac.com"
    msg1 = MIMEText(body, 'text')
    msg.attach(msg1)
    mailp.communicate(msg.as_string())
ps_Mail()
For more details....
What is the difference between using universal_newlines=True (with bufsize=1) and using default arguments with Popen
The default values are: universal_newlines=False (meaning input/output is accepted as bytes, not Unicode strings plus the universal newlines mode handling (hence the name of the parameter though text_mode might have been a better name here) is disabled -- you get binary data as is (unless POSIX layer on Windows messes it up) and bufsize=-1 (meaning the streams are fully buffered -- the default buffer size is used).
universal_newlines=True uses locale.getpreferredencoding(False) character encoding to decode bytes (that may be different from ascii encoding used in your code).
If universal_newlines=False then for line in Robocopy.stdout: iterates over b'\n'-separated lines. If the process uses non-ascii encoding e.g., UTF-16 for its output then even if os.linesep == '\n' on your system; you may get a wrong result. If you want to consume text lines, use the text mode: pass universal_newlines=True or use io.TextIOWrapper(process.stdout) explicitly.
The second version does include universal_newlines and therefore I specify a bufsize.
In general, it is not necessary to specify bufsize if you use universal_newlines (you may but it is not required). And you don't need to specify bufsize in your case. bufsize=1 enables line-buffered mode (the input buffer is flushed automatically on newlines if you would write to process.stdin) otherwise it is equivalent to the default bufsize=-1.
I just started off with scrapy. I've loaded the page http://www.ikea.com/ae/en/catalog/categories/departments/childrens_ikea/31772/ with scrapy shell [url] and ran response.css(div.productTitle.Floatleft) to get product names but it gives me the following error:
Traceback (most recent call last): File "", line 1, in
NameError: name 'div' is not defined.
How can I fix this?
You have to use string: "div.productTitle.Floatleft". See " "
Now you try to use variable div.
EDIT: to get correct data you have to set User-Agent
Run shell
scrapy shell http://www.ikea.com/ae/en/catalog/categories/departments/childrens_ikea/31772/
In shell you can use web browser to see HTML from server and you will see error message.
view(response)
You get page again using different User-Agent (using url from previous response)
fetch(response.url, headers={'User-Agent': 'Mozilla/5.0'})
response.css('div.productTitle.floatLeft')
BTW: it has to be floatLeft, not Floatleft - see lower f and upper L
EDIT: the same as standalone script (doesn't need project)
import scrapy
class MySpider(scrapy.Spider):
    """Spider that yields title, description and price for each product block on the start page."""

    name = 'myspider'
    #allowed_domains = ['http://www.ikea.com']
    start_urls = ['http://www.ikea.com/ae/en/catalog/categories/departments/childrens_ikea/31772/']

    def parse(self, response):
        """Print and yield one item per ``div.product`` element of ``response``.

        ``title`` and ``description`` are the lists of text nodes found under
        their selectors; ``price`` is the first regular-price text node,
        stripped, or an empty string when there is none.
        """
        print('url:', response.url)

        all_products = response.css('div.product')

        for product in all_products:
            title = product.css('div.productTitle.floatLeft ::text').extract()
            description = product.css('div.productDesp ::text').extract()
            price = product.css('div.price.regularPrice ::text').extract()
            # Guard the [0] lookup: a product without a regular-price node
            # used to raise IndexError and stop parsing the rest of the page.
            price = price[0].strip() if price else ''
            print('item:', title, description, price)
            yield {'title': title, 'description': description, 'price': price}
# --- it runs without project and saves in 'output.csv' ---
from scrapy.crawler import CrawlerProcess
# Run MySpider from this script with inline settings instead of a project.
c = CrawlerProcess({
'USER_AGENT': 'Mozilla/5.0',
# Feed settings: items are saved in CSV format to output.csv.
'FEED_FORMAT': 'csv',
'FEED_URI': 'output.csv',
})
# Register the spider class, then start the crawl.
c.crawl(MySpider)
c.start()
Result in file output.csv:
title,description,price
BÖRJA,feeding spoon and baby spoon,Dhs 5.00
BÖRJA,training beaker,Dhs 5.00
KLADD RANDIG,bib,Dhs 9.00
KLADDIG,bib,Dhs 29.00
MATA,4-piece eating set,Dhs 9.00
SMASKA,bowl,Dhs 9.00
SMASKA,plate,Dhs 12.00
SMÅGLI,plate/bowl,Dhs 19.00
STJÄRNBILD,bib,Dhs 19.00