I have an invoices list as below:
def invoices = [
'LEDES98BI V2',
'LINE|INVOICE_DATE|INVOICE_NUMBER|INVOICE_TOTAL',
'1|20150301|INV-Error_Test1|22',
'2|20150301|INV-Error_Test1|24',
'3|20150301|INV-Error_Test2|26',
'4|20150301|INV-Error_Test2|28,']
I am trying to do a groupBy on the above collection by INVOICE_NUMBER, to get a map with INVOICE_NUMBER as the keys and the matching lines as the values. The code below does it:
def lines = invoices*.split('\\|').findAll{ it.size()>1 }
def heads = lines.first()
def invoiceMap = lines.tail().collect{ [heads, it].transpose().collectEntries() }.groupBy{ it.INVOICE_NUMBER }
If I print invoiceMap, I get the map I intended, as below:
[INV-Error_Test1:[[LINE:1, INVOICE_DATE:20150301, INVOICE_NUMBER:INV-Error_Test1, INVOICE_TOTAL:22],
[LINE:2, INVOICE_DATE:20150301, INVOICE_NUMBER:INV-Error_Test1, INVOICE_TOTAL:24]],
INV-Error_Test2:[[LINE:3, INVOICE_DATE:20150301, INVOICE_NUMBER:INV-Error_Test2, INVOICE_TOTAL:26],
[LINE:4, INVOICE_DATE:20150301, INVOICE_NUMBER:INV-Error_Test2, INVOICE_TOTAL:28,]]
]
But if the INVOICE_NUMBER has any white space in it in the invoices list, my code doesn't work. Can someone help me make my code work with white space in INVOICE_NUMBER?
Use a proper CSV parser, rather than rolling your own.
@Grab('com.xlson.groovycsv:groovycsv:1.0')
import static com.xlson.groovycsv.CsvParser.parseCsv
def invoices = [
'LEDES98BI V2',
'LINE|INVOICE_DATE|INVOICE_NUMBER|INVOICE_TOTAL',
'1|20150301|INV-Error_Test1|22',
'2|20150301|INV-Error_Test1|24',
'3|20150301|INV-Error_Test2|26',
'4|20150301|INV-Error_Test2|28,']
def data = parseCsv(invoices.drop(1).join('\n'), separator:'|')
def invoiceMap = data.collect().groupBy { it.INVOICE_NUMBER }
Or with a space in the column title:
def invoices = [
'LEDES98BI V2',
'LINE|INVOICE_DATE|INVOICE NUMBER|INVOICE_TOTAL',
'1|20150301|INV-Error_Test1|22',
'2|20150301|INV-Error_Test1|24',
'3|20150301|INV-Error_Test2|26',
'4|20150301|INV-Error_Test2|28,']
def data = parseCsv(invoices.drop(1).join('\n'), separator:'|')
def invoiceMap = data.collect().groupBy { it.'INVOICE NUMBER' }
You just need to quote the property name, like this:
def invoiceMap = lines.tail().collect{ [heads, it].transpose().collectEntries() }.groupBy{ it.'INVOICE NUMBER' }
Thanks in advance for your help.
I'm currently running a web scraper (this is the first time I've ever done something like this). It pulls addresses from the URL and then matches an address against the user's input. This will be going into a chat bot, and I'm wondering how I can make it run on Google Cloud Functions. What's the process for doing this? Is there a tutorial anywhere?
This is my code so far. There is a small items file too.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from ..items import DataItem
from fuzzywuzzy import fuzz
from urllib.parse import urljoin
import scrapy
class AddressesSpider(scrapy.Spider):
    name = 'Addresses'
    allowed_domains = ['find-energy-certificate.service.gov.uk']
    postcode = "bh10+4ah"
    start_urls = ['https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode=' + postcode]

    ## def start_requests(self):
    ##     self.first = input("Please enter the address you would like to match: ")
    ##     yield scrapy.Request(url=self.start_urls[0], callback=self.parse)
    def parse(self, response):
        first = input("Please enter the address you would like to match: ")
        highest_ratios = []
        highest_item = None
        for row in response.xpath('//table[@class="govuk-table"]//tr'):
            address = row.xpath("normalize-space(.//a[@class='govuk-link']/text())").extract()[0].lower()
            address = address.rsplit(',', 2)[0]
            link = row.xpath('.//a[@class="govuk-link"]/@href').extract()
            details = row.xpath("normalize-space(.//td/following-sibling::td)").extract()
            ratio = fuzz.token_set_ratio(address, first)

            item = DataItem()
            item['link'] = link
            item['details'] = details
            item['address'] = address
            item['ratioresult'] = ratio

            if len(highest_ratios) < 3:
                highest_ratios.append(item)
            elif ratio > min(highest_ratios, key=lambda x: x['ratioresult'])['ratioresult']:
                highest_ratios.remove(min(highest_ratios, key=lambda x: x['ratioresult']))
                highest_ratios.append(item)

        highest_ratios_100 = [item for item in highest_ratios if item['ratioresult'] == 100]

        if highest_ratios_100:
            for item in highest_ratios_100:
                yield item
        else:
            yield max(highest_ratios, key=lambda x: x['ratioresult'])

        if len(highest_ratios_100) > 1:
            for i, item in enumerate(highest_ratios_100):
                print(f"{i+1}: {item['address']}")
            selected = int(input("Please select the correct address by entering the number corresponding to the address: ")) - 1
            selected_item = highest_ratios_100[selected]
        else:
            selected_item = highest_ratios_100[0] if highest_ratios_100 else max(highest_ratios, key=lambda x: x['ratioresult'])

        new_url = selected_item['link'][0]
        new_url = str(new_url)
        if new_url:
            base_url = 'https://find-energy-certificate.service.gov.uk'
            print(f'Base URL: {base_url}')
            print(f'New URL: {new_url}')
            new_url = urljoin(base_url, new_url)
            print(f'Combined URL: {new_url}')
            yield scrapy.Request(new_url, callback=self.parse_new_page)
    def parse_new_page(self, response):
        Postcode = response.xpath('normalize-space((//p[@class="epc-address govuk-body"]/text())[last()])').extract()
        Town = response.xpath('normalize-space((//p[@class="epc-address govuk-body"]/text())[last()-1])').extract()
        First = response.xpath(".//p[@class='epc-address govuk-body']").extract()
        Type = response.xpath('normalize-space(//dd[1]/text())').extract_first()
        Walls = response.xpath("//th[contains(text(), 'Wall')]/following-sibling::td[1]/text()").extract()
        Roof = response.xpath("//th[contains(text(), 'Roof')]/following-sibling::td[1]/text()").extract()
        Heating = response.xpath("//th[text()='Main heating']/following-sibling::td[1]/text()").extract_first()
        CurrentScore = response.xpath('//body[1]/div[2]/main[1]/div[1]/div[3]/div[3]/svg[1]/svg[1]/text[1]/text()').re_first("[0-9+]{1,2}")
        Maxscore = response.xpath('//body[1]/div[2]/main[1]/div[1]/div[3]/div[3]/svg[1]/svg[2]/text[1]/text()').re_first("[0-9+]{2}")
        Expiry = response.xpath('normalize-space(//b)').extract_first()
        FloorArea = response.xpath('//dt[contains(text(), "floor area")]/following-sibling::dd/text()').re_first("[0-9+]{2,3}")
        Steps = response.xpath("//h3[contains(text(),'Step')]/text()").extract()

        yield {
            'Postcode': Postcode,
            'Town': Town,
            'First': First,
            'Type': Type,
            'Walls': Walls,
            'Roof': Roof,
            'Heating': Heating,
            'CurrentScore': CurrentScore,
            'Maxscore': Maxscore,
            'Expiry': Expiry,
            'FloorArea': FloorArea,
            'Steps': Steps
        }
I've tried googling and having a look around, but I can't work out how to deploy this as a project to run on Google Cloud Functions. Or can I just copy the code into the console somewhere?
You can try running your spider from a script. However, a better solution would be to wrap scrapy in its own child process.
For example:
from multiprocessing import Process, Queue
from ... import MySpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
def my_cloud_function(event, context):

    def script(queue):
        try:
            settings = get_project_settings()

            settings.setdict({
                'LOG_LEVEL': 'ERROR',
                'LOG_ENABLED': True,
            })

            process = CrawlerProcess(settings)
            process.crawl(MySpider)
            process.start()
            queue.put(None)
        except Exception as e:
            queue.put(e)

    queue = Queue()

    # wrap the spider in a child process
    main_process = Process(target=script, args=(queue,))
    main_process.start()  # start the process
    main_process.join()   # block until the spider finishes

    result = queue.get()  # check the process did not return an error
    if result is not None:
        raise result

    return 'ok'
You can refer to this tutorial for more info.
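One practical note (an assumption about how you would wire this up, not something from the tutorial): a Cloud Function has no console, so the input() call in parse() cannot run there. Scrapy forwards extra crawl() keyword arguments to the spider as attributes, so the address could be passed in through the trigger payload instead. Re-using the imports above, a minimal sketch:

def my_cloud_function(event, context):
    # assumption: the trigger payload carries the address to match
    address = (event or {}).get('address', '')

    def script(queue):
        try:
            process = CrawlerProcess(get_project_settings())
            # extra crawl() kwargs become spider attributes, so the spider can
            # read self.address in parse() instead of calling input()
            process.crawl(MySpider, address=address)
            process.start()
            queue.put(None)
        except Exception as e:
            queue.put(e)

    queue = Queue()
    child = Process(target=script, args=(queue,))
    child.start()
    child.join()
    error = queue.get()
    if error is not None:
        raise error
    return 'ok'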
I'm having trouble getting the form search to return an exact match for the user's input.
Emphasoft developer challenge:
Taking a list of tax form names (ex: "Form W-2", "Form 1095-C"),
search the website and return some informational results.
Specifically, you must return the "Product Number", the "Title", and
the maximum and minimum years the form is available for download.
Taking a tax form name (ex: "Form W-2") and a range of years
(inclusive, 2018-2020 should fetch three years), download all PDFs
available within that range.
import json
import os
import sys

import requests
from bs4 import BeautifulSoup
URL = 'https://apps.irs.gov/app/picklist/list/priorFormPublication.html?resultsPerPage=200&sortColumn=sortOrder&indexOfFirstRow=0&{param.strip}&isDescending=false'
def get_forms(list_tax_form: list):
    """
    Function to get responses from irs.gov with all forms content.
    :param list_tax_form: list of form names that we want to get info about
    :return: dict with form name, form title
    """
    response_list = []  # list for all responses of form names
    with requests.session() as session:
        for param in list_tax_form:
            request_params = {'value': param,
                              'criteria': 'formNumber',
                              'submitSearch': 'Find',
                              }
            res = session.get(URL, params=request_params).content
            response_list.append(res)
    return response_list
def parse_responses(list_tax_form: list):
    """
    Function to get all form names, titles and years from the previous function's return.
    :param list_tax_form: list of form names that we want to get info about
    :return: list of form names, titles, years
    """
    responses = get_forms(list_tax_form)
    # empty lists to fill with the received information for all names, years, and titles
    td_form_name, td_form_title, td_form_rev_year = [], [], []
    for response in responses:
        soup = BeautifulSoup(response, 'lxml')
        td_name = soup.find_all('td', {'class': 'LeftCellSpacer'})
        td_title = soup.find_all('td', {'class': 'MiddleCellSpacer'})
        td_rev_year = soup.find_all('td', {'class': 'EndCellSpacer'})
        td_form_name.extend(td_name)
        td_form_title.extend(td_title)
        td_form_rev_year.extend(td_rev_year)
    return td_form_name, td_form_title, td_form_rev_year
def format_responses(list_tax_form: list):
    """
    Function to format all the responses we got (Task 1).
    :param list_tax_form: list of form names that we want to get info about
    :return: formatted names, links, years
    """
    td_names, td_titles, td_years = parse_responses(list_tax_form)
    names = [name.text.strip() for name in td_names]
    links = [link.find('a')['href'] for link in td_names]
    titles = [title.text.strip() for title in td_titles]
    years = [int(year.text.strip()) for year in td_years]
    set_names = set(names)
    final_dict = []
    # loop to create a dictionary of result information with the years the tax form is available to download
    for name in set_names:
        max_year = 0
        min_year = max(years)
        dict1 = {'form_number': name}
        for index, p_name in enumerate(names):
            if p_name == name:
                if years[index] > max_year:
                    max_year = years[index]
                elif years[index] < min_year:
                    min_year = years[index]
                dict1['form_title'] = titles[index]
        dict1['max_year'] = max_year
        dict1['min_year'] = min_year
        final_dict.append(dict1)
    print(json.dumps(final_dict, indent=2))
    return names, links, years
def download_files(list_tax_form):
    """
    Function to download PDF files of the form_name entered by the user (Task 2).
    :param list_tax_form: list of form names that we want to get info about
    :return: message to the user on whether the files were created successfully
    """
    names, links, years = format_responses(list_tax_form)
    form_name = input('enter form name: ')
    if form_name in names:
        print('form exists. enter years range')
        form_year1 = int(input('start year to analysis: '))
        form_year2 = int(input('end year to analysis: '))
        try:
            os.mkdir(form_name)
        except FileExistsError:
            pass
        # indices to define the form_name range in the list of all tax form names
        r_index = names.index(form_name)  # index of first form_name mention in the list
        l_index = names.index(form_name)  # index of last form_name mention in the list
        for name in names:
            if name == form_name:
                r_index += 1
        years = years[l_index:r_index]
        if form_year1 < form_year2:
            range_years = range(form_year1, form_year2 + 1)
            for year in range_years:
                if year in years:
                    link = links[years.index(year)]
                    form_file = requests.get(link, allow_redirects=True)
                    open(f'{form_name}/{form_name}_{str(year)}.pdf', 'wb').write(form_file.content)
            print(f'files saved to {form_name}/ directory!')
    else:
        print('input correct form name!')


if __name__ == '__main__':
    tax_list = sys.argv[1:]  # form names
    download_files(tax_list)
(For example, "Form W-2" should not also return "Form W-2 P".)
When this file is run, it displays other, unrelated results.
How can I resolve this issue so that only the forms the user specified are displayed?
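For illustration, the behaviour I'm after is essentially this (a sketch over the lists that format_responses already builds; keep_exact_matches is a hypothetical helper, not code I have written):

def keep_exact_matches(names, titles, years, requested_names):
    # Sketch: keep only rows whose parsed form number exactly equals one of the
    # requested names, so "Form W-2" does not also pick up "Form W-2 P".
    rows = []
    for name, title, year in zip(names, titles, years):
        if name in requested_names:
            rows.append({'form_number': name, 'form_title': title, 'year': year})
    return rows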
I'm trying to dynamically create a nested map like the one below in code.
def people = [
[name: 'Ash', age: '21', gender: 'm'],
[name: 'Jo', age: '22', gender: 'f'],
[name: 'etc.', age: '42', gender: 'f']
]
So I can search it like below
person = people.findAll { item ->
    item.gender == 'm' &&
    item.age == '21'
}
My problem is that whilst I can dynamically create one-dimensional maps in code, I don't know how to combine them into a nested structure. For example, assume that in code I have created the two maps name1 and name2 below. How do I add them to people so they are nested like the example above?
def people = [:]
def name1 = [name:'ash', age:'21', gender:'m']
def name2 = [name:'Jo', age:'22', gender:'f']
I've searched / tried so many posts without success. Below is close, but does not work :(
people.put((),(name1))
people.put((),(name2))
In your example, people is a list of maps, not a nested map.
So you can simply do:
def people = []
def name1 = [name:'ash', age:'21', gender:'m']
def name2 = [name:'Jo', age:'22', gender:'f']
Then:
people += name1
people += name2
Or define it in one line:
def people = [name1, name2]
I have 2 datastore models:
class KindA(ndb.Model):
    field_a1 = ndb.StringProperty()
    field_a2 = ndb.StringProperty()


class KindB(ndb.Model):
    field_b1 = ndb.StringProperty()
    field_b2 = ndb.StringProperty()
    key_to_kind_a = ndb.KeyProperty(KindA)
I want to query KindB and output it to a CSV file, but if an entity of KindB points to an entity of KindA, I want those fields to be present in the CSV as well.
If I were able to use ndb inside of a transform, I would set up my pipeline like this:
def format(element):  # element is an `entity_pb2` object of KindB
    try:
        obj_a_key_id = element.properties.get('key_to_kind_a', None).key_value.path[0]
    except:
        obj_a_key_id = None

    # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<< HOW DO I DO THIS
    obj_a = ndb.Key(KindA, obj_a_key_id).get() if obj_a_key_id else None

    return ",".join([
        element.properties.get('field_b1', None).string_value,
        element.properties.get('field_b2', None).string_value,
        obj_a.properties.get('field_a1', None).string_value if obj_a else '',
        obj_a.properties.get('field_a2', None).string_value if obj_a else '',
    ])
def build_pipeline(project, start_date, end_date, export_path):
    query = query_pb2.Query()
    query.kind.add().name = 'KindB'
    filter_1 = datastore_helper.set_property_filter(query_pb2.Filter(), 'field_b1', PropertyFilter.GREATER_THAN, start_date)
    filter_2 = datastore_helper.set_property_filter(query_pb2.Filter(), 'field_b1', PropertyFilter.LESS_THAN, end_date)
    datastore_helper.set_composite_filter(query.filter, CompositeFilter.AND, filter_1, filter_2)

    p = beam.Pipeline(options=pipeline_options)
    _ = (p
         | 'read from datastore' >> ReadFromDatastore(project, query, None)
         | 'format' >> beam.Map(format)
         | 'write' >> apache_beam.io.WriteToText(
             file_path_prefix=export_path,
             file_name_suffix='.csv',
             header='field_b1,field_b2,field_a1,field_a2',
             num_shards=1)
         )
    return p
I suppose I could use ReadFromDatastore to query all entities of KindA and then use CoGroupByKey to merge them, but KindA has millions of records and that would be very inefficient.
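For reference, the CoGroupByKey alternative I mean would look roughly like this (a sketch only; key_of and a_ref_of are hypothetical helpers that would extract KindA's key from each entity, and query_a / query_b are the two kind queries):

# Rough shape of the CoGroupByKey route (sketch; it reads every KindA entity,
# which is exactly the cost I want to avoid):
kind_a = (p
          | 'read KindA' >> ReadFromDatastore(project, query_a, None)
          | 'key A by its key' >> beam.Map(lambda e: (key_of(e), e)))
kind_b = (p
          | 'read KindB' >> ReadFromDatastore(project, query_b, None)
          | 'key B by its A ref' >> beam.Map(lambda e: (a_ref_of(e), e)))
joined = ({'a': kind_a, 'b': kind_b} | 'join' >> beam.CoGroupByKey())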
Per the recommendations in this answer: https://stackoverflow.com/a/49130224/4458510
I created the following utils, which were inspired by the source code of
DatastoreWriteFn in apache_beam.io.gcp.datastore.v1.datastoreio
write_mutations and fetch_entities in apache_beam.io.gcp.datastore.v1.helper
import logging
import time
from socket import error as _socket_error
from apache_beam.metrics import Metrics
from apache_beam.transforms import DoFn, window
from apache_beam.utils import retry
from apache_beam.io.gcp.datastore.v1.adaptive_throttler import AdaptiveThrottler
from apache_beam.io.gcp.datastore.v1.helper import make_partition, retry_on_rpc_error, get_datastore
from apache_beam.io.gcp.datastore.v1.util import MovingSum
from apache_beam.utils.windowed_value import WindowedValue
from google.cloud.proto.datastore.v1 import datastore_pb2, query_pb2
from googledatastore.connection import Datastore, RPCError
_WRITE_BATCH_INITIAL_SIZE = 200
_WRITE_BATCH_MAX_SIZE = 500
_WRITE_BATCH_MIN_SIZE = 10
_WRITE_BATCH_TARGET_LATENCY_MS = 5000
def _fetch_keys(project_id, keys, datastore, throttler, rpc_stats_callback=None, throttle_delay=1):
    req = datastore_pb2.LookupRequest()
    req.project_id = project_id
    for key in keys:
        req.keys.add().CopyFrom(key)

    @retry.with_exponential_backoff(num_retries=5, retry_filter=retry_on_rpc_error)
    def run(request):
        # Client-side throttling.
        while throttler.throttle_request(time.time() * 1000):
            logging.info("Delaying request for %ds due to previous failures", throttle_delay)
            time.sleep(throttle_delay)
            if rpc_stats_callback:
                rpc_stats_callback(throttled_secs=throttle_delay)

        try:
            start_time = time.time()
            response = datastore.lookup(request)
            end_time = time.time()

            if rpc_stats_callback:
                rpc_stats_callback(successes=1)
            throttler.successful_request(start_time * 1000)
            commit_time_ms = int((end_time - start_time) * 1000)
            return response, commit_time_ms
        except (RPCError, _socket_error):
            if rpc_stats_callback:
                rpc_stats_callback(errors=1)
            raise

    return run(req)
# Copied from _DynamicBatchSizer in apache_beam.io.gcp.datastore.v1.datastoreio
class _DynamicBatchSizer(object):
    """Determines request sizes for future Datastore RPCS."""

    def __init__(self):
        self._commit_time_per_entity_ms = MovingSum(window_ms=120000, bucket_ms=10000)

    def get_batch_size(self, now):
        """Returns the recommended size for datastore RPCs at this time."""
        if not self._commit_time_per_entity_ms.has_data(now):
            return _WRITE_BATCH_INITIAL_SIZE

        recent_mean_latency_ms = (self._commit_time_per_entity_ms.sum(now) / self._commit_time_per_entity_ms.count(now))
        return max(_WRITE_BATCH_MIN_SIZE,
                   min(_WRITE_BATCH_MAX_SIZE,
                       _WRITE_BATCH_TARGET_LATENCY_MS / max(recent_mean_latency_ms, 1)))

    def report_latency(self, now, latency_ms, num_mutations):
        """Reports the latency of an RPC to Datastore.

        Args:
          now: double, completion time of the RPC as seconds since the epoch.
          latency_ms: double, the observed latency in milliseconds for this RPC.
          num_mutations: int, number of mutations contained in the RPC.
        """
        self._commit_time_per_entity_ms.add(now, latency_ms / num_mutations)
class LookupKeysFn(DoFn):
    """A `DoFn` that looks up keys in the Datastore."""

    def __init__(self, project_id, fixed_batch_size=None):
        self._project_id = project_id
        self._datastore = None
        self._fixed_batch_size = fixed_batch_size
        self._rpc_successes = Metrics.counter(self.__class__, "datastoreRpcSuccesses")
        self._rpc_errors = Metrics.counter(self.__class__, "datastoreRpcErrors")
        self._throttled_secs = Metrics.counter(self.__class__, "cumulativeThrottlingSeconds")
        self._throttler = AdaptiveThrottler(window_ms=120000, bucket_ms=1000, overload_ratio=1.25)
        self._elements = []
        self._batch_sizer = None
        self._target_batch_size = None

    def _update_rpc_stats(self, successes=0, errors=0, throttled_secs=0):
        """Callback function, called by _fetch_keys()"""
        self._rpc_successes.inc(successes)
        self._rpc_errors.inc(errors)
        self._throttled_secs.inc(throttled_secs)

    def start_bundle(self):
        """(Re)initialize: connection with datastore, _DynamicBatchSizer obj"""
        self._elements = []
        self._datastore = get_datastore(self._project_id)
        if self._fixed_batch_size:
            self._target_batch_size = self._fixed_batch_size
        else:
            self._batch_sizer = _DynamicBatchSizer()
            self._target_batch_size = self._batch_sizer.get_batch_size(time.time()*1000)

    def process(self, element):
        """Collect elements and process them as a batch"""
        self._elements.append(element)
        if len(self._elements) >= self._target_batch_size:
            return self._flush_batch()

    def finish_bundle(self):
        """Flush any remaining elements"""
        if self._elements:
            objs = self._flush_batch()
            for obj in objs:
                yield WindowedValue(obj, window.MAX_TIMESTAMP, [window.GlobalWindow()])

    def _flush_batch(self):
        """Fetch all of the collected keys from datastore"""
        response, latency_ms = _fetch_keys(
            project_id=self._project_id,
            keys=self._elements,
            datastore=self._datastore,
            throttler=self._throttler,
            rpc_stats_callback=self._update_rpc_stats)
        logging.info("Successfully read %d keys in %dms.", len(self._elements), latency_ms)

        if not self._fixed_batch_size:
            now = time.time()*1000
            self._batch_sizer.report_latency(now, latency_ms, len(self._elements))
            self._target_batch_size = self._batch_sizer.get_batch_size(now)

        self._elements = []
        return [entity_result.entity for entity_result in response.found]
class LookupEntityFieldFn(LookupKeysFn):
    """
    Looks up a field on an EntityPb2 object.

    Expects an EntityPb2 object as input.
    Outputs a tuple, where the first element is the input object and the second
    element is the object found during the lookup.
    """

    def __init__(self, project_id, field_name, fixed_batch_size=None):
        super(LookupEntityFieldFn, self).__init__(project_id=project_id, fixed_batch_size=fixed_batch_size)
        self._field_name = field_name

    @staticmethod
    def _pb2_key_value_to_tuple(kv):
        """Converts a key_value object into a tuple, so that it can be a dictionary key"""
        path = []
        for p in kv.path:
            path.append(p.name)
            path.append(p.id)
        return tuple(path)

    def _flush_batch(self):
        _elements = self._elements
        keys_to_fetch = []
        for element in self._elements:
            kv = element.properties.get(self._field_name, None)
            if kv and kv.key_value and kv.key_value.path:
                keys_to_fetch.append(kv.key_value)
        self._elements = keys_to_fetch

        read_keys = super(LookupEntityFieldFn, self)._flush_batch()
        _by_key = {self._pb2_key_value_to_tuple(entity.key): entity for entity in read_keys}

        output_pairs = []
        for input_obj in _elements:
            kv = input_obj.properties.get(self._field_name, None)
            output_obj = None
            if kv and kv.key_value and kv.key_value.path:
                output_obj = _by_key.get(self._pb2_key_value_to_tuple(kv.key_value), None)
            output_pairs.append((input_obj, output_obj))
        return output_pairs
The key to this is the line response = datastore.lookup(request) (pulled out on its own in the sketch after this list), where:
datastore = get_datastore(project_id) (from apache_beam.io.gcp.datastore.v1.helper.get_datastore)
request is a LookupRequest from google.cloud.proto.datastore.v1.datastore_pb2
response is a LookupResponse from google.cloud.proto.datastore.v1.datastore_pb2
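In isolation, that call looks roughly like this (a sketch; project_id and entity_key stand in for real values, and the batching, throttling and retries from the code above are omitted):

# Sketch of the core lookup on its own (project_id / entity_key are placeholders)
datastore = get_datastore(project_id)
request = datastore_pb2.LookupRequest()
request.project_id = project_id
request.keys.add().CopyFrom(entity_key)   # e.g. a key_value taken from a KeyProperty field
response = datastore.lookup(request)
entities = [result.entity for result in response.found]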
The rest of the above code does things like:
use a single connection to the datastore for a DoFn bundle
batch keys together before performing a lookup request
throttle interactions with the datastore if requests start to fail
(Honestly, I don't know how critical these bits are; I just came across them when browsing the apache_beam source code.)
The resulting util function LookupEntityFieldFn(project_id, field_name) is a DoFn that takes an entity_pb2 object as input, extracts and fetches the key property that resides on the field field_name, and outputs the result as a tuple (the fetch result is paired with the input object).
My pipeline code then became:
def format(element):  # element is a tuple of `entity_pb2` objects
    kind_b_element, kind_a_element = element
    return ",".join([
        kind_b_element.properties.get('field_b1', None).string_value,
        kind_b_element.properties.get('field_b2', None).string_value,
        kind_a_element.properties.get('field_a1', None).string_value if kind_a_element else '',
        kind_a_element.properties.get('field_a2', None).string_value if kind_a_element else '',
    ])
def build_pipeline(project, start_date, end_date, export_path):
    query = query_pb2.Query()
    query.kind.add().name = 'KindB'
    filter_1 = datastore_helper.set_property_filter(query_pb2.Filter(), 'field_b1', PropertyFilter.GREATER_THAN, start_date)
    filter_2 = datastore_helper.set_property_filter(query_pb2.Filter(), 'field_b1', PropertyFilter.LESS_THAN, end_date)
    datastore_helper.set_composite_filter(query.filter, CompositeFilter.AND, filter_1, filter_2)

    p = beam.Pipeline(options=pipeline_options)
    _ = (p
         | 'read from datastore' >> ReadFromDatastore(project, query, None)
         | 'extract field' >> apache_beam.ParDo(LookupEntityFieldFn(project_id=project, field_name='key_to_kind_a'))
         | 'format' >> beam.Map(format)
         | 'write' >> apache_beam.io.WriteToText(
             file_path_prefix=export_path,
             file_name_suffix='.csv',
             header='field_b1,field_b2,field_a1,field_a2',
             num_shards=1)
         )
    return p
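For completeness, launching it is the usual Beam call (a sketch; the argument values are whatever the job is configured with):

# Sketch: run the pipeline and block until the export finishes
result = build_pipeline(project, start_date, end_date, export_path).run()
result.wait_until_finish()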
I have this code, but I don't know how to read the links from a CSV or a list. I want to read the links, scrape details off every single link, and then save the data for each link in its respective columns in an output CSV.
Here is the code I built to get specific data.
from bs4 import BeautifulSoup
import requests
url = "http://www.ebay.com/itm/282231178856"
r = requests.get(url)
x = BeautifulSoup(r.content, "html.parser")
# print(x.prettify().encode('utf-8'))
# time to find some tags!!
# y = x.find_all("tag")
z = x.find_all("h1", {"itemprop": "name"})
# print z
# for loop to extract the title.
for item in z:
    try:
        print item.text.replace('Details about ', '')
    except:
        pass

# category extraction done
m = x.find_all("span", {"itemprop": "name"})
# print m
for item in m:
    try:
        print item.text
    except:
        pass

# item condition extraction done
n = x.find_all("div", {"itemprop": "itemCondition"})
# print n
for item in n:
    try:
        print item.text
    except:
        pass

# sold number extraction done
k = x.find_all("span", {"class": "vi-qtyS vi-bboxrev-dsplblk vi-qty-vert-algn vi-qty-pur-lnk"})
# print k
for item in k:
    try:
        print item.text
    except:
        pass

# Watchers extraction done
u = x.find_all("span", {"class": "vi-buybox-watchcount"})
# print u
for item in u:
    try:
        print item.text
    except:
        pass

# returns details extraction done
t = x.find_all("span", {"id": "vi-ret-accrd-txt"})
# print t
for item in t:
    try:
        print item.text
    except:
        pass

# per hour day view done
a = x.find_all("div", {"class": "vi-notify-new-bg-dBtm"})
# print a
for item in a:
    try:
        print item.text
    except:
        pass

# trending at price
b = x.find_all("span", {"class": "mp-prc-red"})
# print b
for item in b:
    try:
        print item.text
    except:
        pass
Your question is kind of vague!
Which links are you talking about? There are a hundred on a single eBay page. And which pieces of information would you like to scrape? Similarly, there is a ton of it.
But anyway, here is how I would proceed:
# First, create a list of urls you want to iterate on
urls = []
soup = BeautifulSoup(r.text, "html.parser")

# Assuming your links of interest are values of "href" attributes within <a> tags
a_tags = soup.find_all("a")
for tag in a_tags:
    urls.append(tag["href"])

# Second, start to iterate while storing the info
info_1, info_2 = [], []
for link in urls:
    # Do stuff here, maybe it's time to define your existing loops as functions?
    info_a, info_b = YourFunctionReturningValues(soup)
    info_1.append(info_a)
    info_2.append(info_b)
Then if you want a nice csv output:
# Don't forget to import the csv module
with open(r"path_to_file.csv", "wb") as my_file:
    csv_writer = csv.writer(my_file, delimiter=",")
    csv_writer.writerows(zip(urls, info_1, info_2))
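And since you mention the links may already live in a CSV file, reading them back in is the same module in the other direction (a sketch; "links.csv" and the one-URL-per-row, first-column layout are assumptions):

import csv

# Sketch: read the URLs from the first column of an existing CSV file
# ("links.csv" and the column layout are assumptions)
urls = []
with open(r"links.csv") as links_file:
    for row in csv.reader(links_file):
        if row:
            urls.append(row[0])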
Hope this helps. Of course, don't hesitate to give additional info, so I can add more detail.
On attributes with BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#attributes
About the csv module: https://docs.python.org/2/library/csv.html