Company name extraction with bert-base-ner: easy way to know which words relate to which? - bert-language-model

Hi I'm trying to extract the full company name from a string description about the company with bert-base-ner. I am also open to trying other methods but I couldn't really find one. The issue is that although it tags the orgs correctly, it tags it by word/token so I can't easily extract the full company name without having to concat and build it myself.
Is there an easier way or model to do this?
Here is my code:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
# Load the pretrained tokenizer and token-classification model for "dslim/bert-base-NER".
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
# No aggregation strategy is passed, so the pipeline reports one result per
# token / word piece (see the sample output that follows this snippet).
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
# NOTE(review): `text1` is not defined in this snippet; presumably it is the
# company description string to analyse.
ner_results = nlp(text1)
print(ner_results)
Here is my output for one text string:
[{'entity': 'B-ORG', 'score': 0.99965024, 'index': 1, 'word': 'Orion', 'start': 0, 'end': 5}, {'entity': 'I-ORG', 'score': 0.99945647, 'index': 2, 'word': 'Metal', 'start': 6, 'end': 11}, {'entity': 'I-ORG', 'score': 0.99943095, 'index': 3, 'word': '##s', 'start': 11, 'end': 12}, {'entity': 'I-ORG', 'score': 0.99939036, 'index': 4, 'word': 'Limited', 'start': 13, 'end': 20}, {'entity': 'B-LOC', 'score': 0.9997398, 'index': 14, 'word': 'Australia', 'start': 78, 'end': 87}]

I faced a similar issue and solved it by using a better model called "xlm-roberta-large-finetuned-conll03-english", which performs much better than the one you're using right now and returns the complete organization name rather than broken pieces. Feel free to test the code below, which extracts the full list of organizations from a document. Please accept my answer by clicking the tick button if you find it useful.
from transformers import pipeline
from subprocess import list2cmdline
from pdfminer.high_level import extract_text
import docx2txt
import spacy
from spacy.matcher import Matcher
import time
# Wall-clock start time; read again at the end of the script to report the run time.
start = time.time()
# NOTE(review): `nlp` and `matcher` are created here but are not used by any
# of the code visible in this snippet.
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
# Checkpoint name handed to the token-classification pipeline below.
model_checkpoint = "xlm-roberta-large-finetuned-conll03-english"
# aggregation_strategy="simple" makes the pipeline return grouped entities
# (each with an "entity_group" key) instead of one result per token.
token_classifier = pipeline(
"token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
def text_extraction(file):
    """Extract the plain text of a PDF or Word document.

    Args:
        file: Path to the document. Paths ending in ".pdf" (in any letter
            case) are read with ``extract_text``; every other path is handed
            to ``docx2txt.process``.

    Returns:
        The extracted text (with tabs replaced by spaces for Word documents),
        or ``None`` when a Word document yields no text.
    """
    # Compare case-insensitively so "CV.PDF" is not mistaken for a Word file.
    if file.lower().endswith(".pdf"):
        return extract_text(file)
    resume_text = docx2txt.process(file)
    if resume_text:
        return resume_text.replace('\t', ' ')
    return None
# Organisation names extraction
def org_name(file):
    """Print and return the unique organisation names found in a document.

    Args:
        file: Path to a PDF or Word document; passed to ``text_extraction``.

    Returns:
        The distinct, non-empty ``ORG`` entity strings in order of first
        appearance, or an empty list when no text could be extracted.
    """
    # Extract the complete text of the document
    extracted_text = text_extraction(file)
    if not extracted_text:
        # text_extraction can return None (or ""); there is nothing to classify.
        print([])
        return []
    entities = token_classifier(extracted_text)
    # Keep the words of the entities grouped as 'ORG', skipping empty strings
    org_words = [
        item["word"]
        for item in entities
        if item["entity_group"] == "ORG" and item["word"]
    ]
    # dict.fromkeys removes duplicates while keeping a stable order
    # (a plain set() would list the names in arbitrary order).
    final = list(dict.fromkeys(org_words))
    print(final)
    return final
# Replace "your file name" with the path of the PDF / Word document to analyse.
org_name("your file name")
end = time.time()
# Seconds elapsed since `start` was recorded, rounded to two decimals.
print("The time of execution of above program is :", round((end - start), 2))

Related

How can I use Google Cloud Functions to run a web scraper?

Thanks in advance for your help.
I'm currently running a web scraper - this is the first time I've ever done something like this. It pulls addresses from the URL and then matches them against the user's input. This will be going into a chatbot, and I'm wondering how I can make it run on Google Cloud Functions. What's the process to do this? Is there a tutorial anywhere?
This is my code so far. There is a small items file too
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from ..items import DataItem
from fuzzywuzzy import fuzz
from urllib.parse import urljoin
import scrapy
class AddressesSpider(scrapy.Spider):
    """Find the energy certificate whose address best matches a typed address.

    The spider loads the postcode search results, scores every listed address
    against the address entered by the user, yields the best candidate(s) and
    then follows the selected certificate link to scrape its details.
    """

    name = 'Addresses'
    allowed_domains = ['find-energy-certificate.service.gov.uk']
    postcode = "bh10+4ah"
    start_urls = ['https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode=' + postcode]

    ## def start_requests(self):
    ##     self.first = input("Please enter the address you would like to match: ")
    ##     yield scrapy.Request(url=self.start_urls[0], callback=self.parse)

    def parse(self, response):
        """Score each result row against the user's address and follow the best one.

        Yields the exact matches (ratio 100) or, failing that, the single best
        match, followed by a request for the selected certificate page.
        """
        def by_ratio(candidate):
            return candidate['ratioresult']

        # NOTE: input() blocks the crawl and needs an interactive terminal.
        first = input("Please enter the address you would like to match: ")
        highest_ratios = []  # the (up to) three best-scoring rows seen so far
        # XPath attribute tests are written with "@" ("#class" is not valid XPath).
        for row in response.xpath('//table[@class="govuk-table"]//tr'):
            address = row.xpath("normalize-space(.//a[@class='govuk-link']/text())").extract()[0].lower()
            # Drop the last two comma-separated parts of the listed address.
            address = address.rsplit(',', 2)[0]
            link = row.xpath('.//a[@class="govuk-link"]/@href').extract()
            details = row.xpath("normalize-space(.//td/following-sibling::td)").extract()
            ratio = fuzz.token_set_ratio(address, first)
            item = DataItem()
            item['link'] = link
            item['details'] = details
            item['address'] = address
            item['ratioresult'] = ratio
            if len(highest_ratios) < 3:
                highest_ratios.append(item)
            else:
                # Replace the weakest of the kept rows when this one scores higher.
                weakest = min(highest_ratios, key=by_ratio)
                if ratio > weakest['ratioresult']:
                    highest_ratios.remove(weakest)
                    highest_ratios.append(item)

        if not highest_ratios:
            # The results table had no rows, so there is nothing to match.
            return

        best_match = max(highest_ratios, key=by_ratio)
        highest_ratios_100 = [item for item in highest_ratios if item['ratioresult'] == 100]
        if highest_ratios_100:
            yield from highest_ratios_100
        else:
            yield best_match

        if len(highest_ratios_100) > 1:
            # Several perfect matches: let the user pick one.
            for i, item in enumerate(highest_ratios_100):
                print(f"{i+1}: {item['address']}")
            selected = int(input("Please select the correct address by entering the number corresponding to the address: ")) - 1
            selected_item = highest_ratios_100[selected]
        else:
            selected_item = highest_ratios_100[0] if highest_ratios_100 else best_match

        links = selected_item['link']
        if not links:
            # Rows without a certificate link (e.g. a header row) cannot be followed.
            return
        new_url = str(links[0])
        if new_url:
            base_url = 'https://find-energy-certificate.service.gov.uk'
            print(f'Base URL: {base_url}')
            print(f'New URL: {new_url}')
            new_url = urljoin(base_url, new_url)
            print(f'Combined URL: {new_url}')
            yield scrapy.Request(new_url, callback=self.parse_new_page)

    def parse_new_page(self, response):
        """Scrape the address, property and rating fields of a certificate page."""
        Postcode = response.xpath('normalize-space((//p[@class="epc-address govuk-body"]/text())[last()])').extract()
        Town = response.xpath('normalize-space((//p[@class="epc-address govuk-body"]/text())[last()-1])').extract()
        First = response.xpath(".//p[@class='epc-address govuk-body']").extract()
        Type = response.xpath('normalize-space(//dd[1]/text())').extract_first()
        Walls = response.xpath("//th[contains(text(), 'Wall')]/following-sibling::td[1]/text()").extract()
        Roof = response.xpath("//th[contains(text(), 'Roof')]/following-sibling::td[1]/text()").extract()
        Heating = response.xpath("//th[text()='Main heating']/following-sibling::td[1]/text()").extract_first()
        CurrentScore = response.xpath('//body[1]/div[2]/main[1]/div[1]/div[3]/div[3]/svg[1]/svg[1]/text[1]/text()').re_first("[0-9+]{1,2}")
        Maxscore = response.xpath('//body[1]/div[2]/main[1]/div[1]/div[3]/div[3]/svg[1]/svg[2]/text[1]/text()').re_first("[0-9+]{2}")
        Expiry = response.xpath('normalize-space(//b)').extract_first()
        FloorArea = response.xpath('//dt[contains(text(), "floor area")]/following-sibling::dd/text()').re_first("[0-9+]{2,3}")
        Steps = response.xpath("//h3[contains(text(),'Step')]/text()").extract()
        yield {
            'Postcode': Postcode,
            'Town': Town,
            'First': First,
            'Type': Type,
            'Walls': Walls,
            'Roof': Roof,
            'Heating': Heating,
            'CurrentScore': CurrentScore,
            'Maxscore': Maxscore,
            'Expiry': Expiry,
            'FloorArea': FloorArea,
            'Steps': Steps
        }
I've tried googling and having a look around and can't get how to deploy this as a project to run on google functions or can I just copy the code into the console somewhere?
You can try running your spider from a script. However, a better solution would be to wrap scrapy in its own child process.
For example:
from multiprocessing import Process, Queue
from ... import MySpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
def my_cloud_function(event, context):
    """Run the spider in a child process and report success or failure.

    Args:
        event: Trigger payload supplied by the platform (not used here).
        context: Trigger metadata supplied by the platform (not used here).

    Returns:
        The string ``'ok'`` when the crawl finished without raising.

    Raises:
        Exception: whatever exception the crawl raised inside the child process.
    """
    def script(queue):
        # Runs in the child process; reports None on success or the exception.
        try:
            settings = get_project_settings()
            settings.setdict({
                'LOG_LEVEL': 'ERROR',
                'LOG_ENABLED': True,
            })
            process = CrawlerProcess(settings)
            process.crawl(MySpider)
            process.start()
            queue.put(None)
        except Exception as e:
            queue.put(e)

    queue = Queue()
    # wrap the spider in a child process
    main_process = Process(target=script, args=(queue,))
    main_process.start()  # start the process
    # Read the result BEFORE joining: a process that has put items on a queue
    # may not terminate until they have been consumed, so join-then-get can
    # deadlock.
    result = queue.get()  # blocks until the spider finishes and reports
    main_process.join()  # reap the child process
    if result is not None:
        raise result
    return 'ok'
You can refer to this tutorial for more info.

Web Scraping: How do I return specific user input forms in python?

I'm having trouble with the forms returning an exact match for the user input.
Emphasoft developer challenge:
Taking a list of tax form names (ex: "Form W-2", "Form 1095-C"),
search the website and return some informational results.
Specifically, you must return the "Product Number", the "Title", and
the maximum and minimum years the form is available for download.
Taking a tax form name (ex: "Form W-2") and a range of years
(inclusive, 2018-2020 should fetch three years), download all PDFs
available within that range.
import json
import os
import sys

import requests
from bs4 import BeautifulSoup
# Base search URL of the prior-year forms list. The form number and search
# criteria are supplied per request through the `params` argument, so the URL
# itself carries no per-form placeholder.
URL = 'https://apps.irs.gov/app/picklist/list/priorFormPublication.html?resultsPerPage=200&sortColumn=sortOrder&indexOfFirstRow=0&isDescending=false'
def get_forms(list_tax_form: list):
    """
    Run one form-number search per requested form name, sharing one HTTP session.

    :param list_tax_form: list of form names that we want to get info about
    :return: list with the raw body of each search response, in request order
    """
    with requests.session() as http:
        return [
            http.get(
                URL,
                params={
                    'value': form_name,
                    'criteria': 'formNumber',
                    'submitSearch': 'Find',
                },
            ).content
            for form_name in list_tax_form
        ]
def parse_responses(list_tax_form: list):
    """
    Gather the name, title and revision-year table cells of every search response.

    :param list_tax_form: list of form names that we want to get info about
    :return: three lists of <td> tags: form names, form titles, revision years
    """
    name_cells, title_cells, year_cells = [], [], []
    # Each column of the results table is identified by the CSS class of its cells.
    columns = (
        ('LeftCellSpacer', name_cells),
        ('MiddleCellSpacer', title_cells),
        ('EndCellSpacer', year_cells),
    )
    for page in get_forms(list_tax_form):
        parsed = BeautifulSoup(page, 'lxml')
        for css_class, bucket in columns:
            bucket.extend(parsed.find_all('td', {'class': css_class}))
    return name_cells, title_cells, year_cells
def format_responses(list_tax_form: list):
    """
    Task 1: summarise the requested forms and print the summary as JSON.

    Only rows whose form name exactly matches one of the requested names are
    kept (ignoring letter case and repeated whitespace), so asking for
    "Form W-2" does not also report "Form W-2 P".

    :param list_tax_form: list of form names that we want to get info about
    :return: tuple (names, links, years) of parallel lists, one entry per
        kept table row
    """
    def normalise(text):
        # Collapse whitespace and ignore case when comparing form names.
        return ' '.join(str(text).split()).casefold()

    td_names, td_titles, td_years = parse_responses(list_tax_form)
    wanted = {normalise(form) for form in list_tax_form}

    names, links, titles, years = [], [], [], []
    for td_name, td_title, td_year in zip(td_names, td_titles, td_years):
        name = td_name.text.strip()
        if normalise(name) not in wanted:
            continue  # the site search also returns forms that merely contain the query
        names.append(name)
        links.append(td_name.find('a')['href'])
        titles.append(td_title.text.strip())
        years.append(int(td_year.text.strip()))

    # One summary entry per form; min and max are tracked independently so a
    # form with a single revision gets min_year == max_year.
    summary = {}
    for name, title, year in zip(names, titles, years):
        entry = summary.setdefault(name, {
            'form_number': name,
            'form_title': title,
            'max_year': year,
            'min_year': year,
        })
        entry['max_year'] = max(entry['max_year'], year)
        entry['min_year'] = min(entry['min_year'], year)

    final_dict = list(summary.values())
    print(json.dumps(final_dict, indent=2))
    return names, links, years
def download_files(list_tax_form):
    """
    Task 2: download the PDFs of one form for an inclusive range of years.

    The form name and the two years are read interactively. Files are written
    to a directory named after the form, as ``<form>/<form>_<year>.pdf``.

    :param list_tax_form: list of form names that we want to get info about
    :return: None; progress messages are printed
    """
    names, links, years = format_responses(list_tax_form)
    form_name = input('enter form name: ')
    if form_name not in names:
        print('input correct form name!')
        return

    print('form exists. enter years range')
    form_year1 = int(input('start year to analysis: '))
    form_year2 = int(input('end year to analysis: '))
    os.makedirs(form_name, exist_ok=True)

    # Pair every year of the chosen form with its own link. Looking the link
    # up through a position in a sliced copy of `years` would point at the
    # wrong row whenever the form is not the first one in the list.
    link_by_year = {}
    for name, link, year in zip(names, links, years):
        if name == form_name:
            link_by_year.setdefault(year, link)

    # The range is inclusive and works for a single year or swapped bounds.
    first_year, last_year = sorted((form_year1, form_year2))
    for year in range(first_year, last_year + 1):
        link = link_by_year.get(year)
        if link is None:
            continue  # the form was not published for this year
        form_file = requests.get(link, allow_redirects=True)
        with open(f'{form_name}/{form_name}_{str(year)}.pdf', 'wb') as pdf_file:
            pdf_file.write(form_file.content)
    print(f'files saved to {form_name}/ directory!')
if __name__ == '__main__':
    # Every command-line argument after the script name is a form name.
    download_files(sys.argv[1:])
(ex: "Form W-2" should not return "Form W-2 P")
When this file is run, it displays other, unrelated results.
How can I resolve this issue to display only specified user requests?

Instaloader data scraping using specific hashtag and timeframe

I need help using instaloader to data scrape posts from Instagram that include #slowfashion from a specific timeframe.
I want to scrape the visual and textual data from the posts (specifically, the image/s posted, their descriptions, and comments).
from datetime import datetime
from itertools import dropwhile, takewhile
import instaloader
# Use parameters to save different metadata.
# download_comments is enabled because the goal is to collect the comments of
# each post in addition to its picture(s) and metadata.
L = instaloader.Instaloader(download_pictures=True,download_videos=False,download_comments=True,save_metadata=True)
# Login
username = input("Enter your username: ")
L.interactive_login(username=username)
# User Query
search = input("Enter Hashtag: ")
limit = int(input("How many posts to download: "))
# Hashtag object
hashtags = instaloader.Hashtag.from_name(L.context, search).get_posts()
# Download Period
# NOTE(review): SINCE is the later date on purpose: the filter below skips
# posts newer than SINCE and stops at the first post not newer than UNTIL,
# which assumes posts arrive newest first. Confirm this for hashtag feeds.
SINCE = datetime(2021, 5, 1)
UNTIL = datetime(2021, 3, 1)
no_of_downloads = 0
for post in takewhile(lambda p: p.date > UNTIL, dropwhile(lambda p: p.date > SINCE, hashtags)):
    # Stop once the requested number of posts has been downloaded.
    if no_of_downloads == limit:
        break
    print(post.date)
    L.download_post(post, "#"+search)
    no_of_downloads += 1

How to convert words(numbers in words) into integer numbers in scrapy?

I have written this code
import scrapy
class YellowPages(scrapy.Spider):
    """Scrape agent name, phone number, address and locality from every result page."""

    name = 'yp'
    start_urls = [
        "https://www.yellowpages.com/search?search_terms=agent&geo_location_terms=Los%20Angeles%2C%20CA&page=1",
    ]

    def parse(self, response):
        """Yield one dict per listing, then follow the "next" link when there is one."""
        # XPath attribute tests are written with "@" ("#class" is not valid XPath).
        agent_name = response.xpath("//a[@class='business-name']/span/text()").extract()
        phone_number = response.xpath("//div[@class='phones phone primary']/text()").extract()
        address = response.xpath("//div[@class='street-address']/text()").extract()
        locality = response.xpath("//div[@class='locality']/text()").extract()
        # NOTE: zip pairs the four lists by position and stops at the shortest,
        # so a listing with a missing field shifts the values after it.
        for item in zip(agent_name, phone_number, address, locality):
            yield {
                #'page' : response.url,
                'Agent name': item[0],
                'Phone number': item[1],
                'Address': item[2],
                'Locality': item[3],
            }
        # extract_first() returns None on the last page instead of raising
        # IndexError the way extract()[0] does.
        next_page_href = response.xpath('//a[@class= "next ajax-page"]/@href').extract_first()
        if next_page_href is not None:
            # response.urljoin resolves the href against the current page URL.
            yield scrapy.Request(response.urljoin(next_page_href), callback=self.parse)
But now I want to add ratings to my CSV file. However, the rating is written as a word rather than a number,
like this.
<div class="result-rating three ">
On the webpage this rating is shown by stars and the number of the total stars is written in word in the code.
I want to get that rating in number. Anyone know how will I able extract the words into numbers??
Assuming the rating is from one to five, you can maintain an array of these words (one to five) and detect them in the string.
Something like this:
word_number_mapping = {'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5}
rating_value = None
# extract_first() gives the class attribute as ONE string such as
# "result-rating three " (or None when the element is missing). With
# extract() the result is a list, and a test like
# `'three' in ['result-rating three ']` is always False.
rating_text = response.css('.result-rating::attr(class)').extract_first()
if rating_text:
    # Compare whole class names so that e.g. 'one' cannot match inside another word.
    rating_classes = rating_text.split()
    for k, v in word_number_mapping.items():
        if k in rating_classes:
            rating_value = v + 0.5 if 'half' in rating_classes else v
            break
Hope it helps.

Why can't I access information in tbody?

[This is the source code of the website][1] I am doing web scraping with BeautifulSoup but cannot find the tr elements in tbody. There actually are tr elements in tbody in the source code of the website; however, the find_all function only returns the tr in thead.
link I am scraping on: https://cpj.org/data/killed/?status=Killed&motiveConfirmed%5B%5D=Confirmed&type%5B%5D=Journalist&start_year=1992&end_year=2019&group_by=year
Here are some of my code:
from urllib.request import urlopen  # urlopen is used below, so it must be imported

from bs4 import BeautifulSoup

url = "https://cpj.org/data/killed/?status=Killed&motiveConfirmed%5B%5D=Confirmed&type%5B%5D=Journalist&start_year=1992&end_year=2019&group_by=year"
html = urlopen(url)
soup = BeautifulSoup(html,'lxml')
type(soup)
# Collects every <tr> present in the HTML returned for the landing page.
tr = soup.find_all("tr")
print(tr)
[1]: https://i.stack.imgur.com/NFwEV.png
Data is requested via API returning json i.e. it is dynamically added so it doesn't appear in your request to the landing page. You can find the API endpoint in the network tab which is used to get the info.
You can alter one of the parameters to a number larger than expected result set then check if you need to make further requests.
import requests
# One request with pageSize=2000, which is larger than the reported rowCount
# (1343), so every row is returned in a single page. The response is parsed as JSON.
r = requests.get('https://cpj.org/api/datamanager/reports/entries?distinct(personId)&includes=organizations,fullName,location,status,typeOfDeath,charges,startDisplay,mtpage,country,type,motiveConfirmed&sort=fullName&pageNum=1&pageSize=2000&in(status,%27Killed%27)&or(eq(type,%22media%20worker%22),in(motiveConfirmed,%27Confirmed%27))&in(type,%27Journalist%27)&ge(year,1992)&le(year,2019)').json()
Otherwise, you can make an initial call, check how many more requests are needed, and alter the appropriate parameters in the URL. You can see that pageCount is returned.
You can see relevant parts in response here for pagesize 20:
{'rowCount': 1343,
'pageNum': 1,
'pageSize': '20',
'pageCount': 68,
All the relevant info for a loop to get all results is there.
After altering to larger number you can see the following:
'rowCount': 1343,
'pageNum': 1,
'pageSize': '2000',
'pageCount': 1,
You can convert to a table using pandas:
import requests
import pandas as pd
# Fetch all rows in one page (pageSize=2000) and parse the JSON response.
r = requests.get('https://cpj.org/api/datamanager/reports/entries?distinct(personId)&includes=organizations,fullName,location,status,typeOfDeath,charges,startDisplay,mtpage,country,type,motiveConfirmed&sort=fullName&pageNum=1&pageSize=2000&in(status,%27Killed%27)&or(eq(type,%22media%20worker%22),in(motiveConfirmed,%27Confirmed%27))&in(type,%27Journalist%27)&ge(year,1992)&le(year,2019)').json()
# The records are stored under the 'data' key of the response.
df = pd.DataFrame(r['data'])
print(df)
Sample of df:
Example of checking the actual row count and making additional requests for the remaining records:
import requests
import pandas as pd
# Rows requested per page. The page size is kept the same for every request:
# asking for page 2 with a different page size would not return the rows that
# follow page 1.
# NOTE(review): assumes pageNum/pageSize address fixed-size pages; confirm against the API.
request_number = 1000
url_head = 'https://cpj.org/api/datamanager/reports/entries?distinct(personId)&includes=organizations,fullName,location,status,typeOfDeath,charges,startDisplay,mtpage,country,type,motiveConfirmed&sort=fullName&pageNum='
url_tail = '&pageSize=' + str(request_number) + '&in(status,%27Killed%27)&or(eq(type,%22media%20worker%22),in(motiveConfirmed,%27Confirmed%27))&in(type,%27Journalist%27)&ge(year,1992)&le(year,2019)'
with requests.Session() as s:
    r = s.get(url_head + '1' + url_tail).json()
    df = pd.DataFrame(r['data'])
    actual_number = r['rowCount']
    frames = [df]
    # The first response reports how many pages exist at this page size.
    for page_num in range(2, r['pageCount'] + 1):
        r = s.get(url_head + str(page_num) + url_tail).json()
        frames.append(pd.DataFrame(r['data']))
    # A single page needs no concatenation.
    final = pd.concat(frames) if len(frames) > 1 else df
To get the tabular content using the selectors you see when inspecting elements, you can try pyppeteer; I've shown below how to work with it. The following approach is asynchronous. I suggest going for this unless you find an API to use instead:
import asyncio
from pyppeteer import launch
# Report page passed to get_table() at the bottom of the script.
url = "https://cpj.org/data/killed/?status=Killed&motiveConfirmed%5B%5D=Confirmed&type%5B%5D=Journalist&start_year=1992&end_year=2019&group_by=year"
async def get_table(link):
    """Open *link* in a browser and print each row of the report table.

    Args:
        link: URL of the page that contains ``table.js-report-builder-table``.

    Each row is printed as a list with the inner text of its ``th`` / ``td``
    cells. The browser is closed when the function ends, even on error.
    """
    browser = await launch(headless=False)
    try:
        [page] = await browser.pages()
        await page.goto(link)
        # Wait until at least one data cell exists before reading the rows.
        await page.waitForSelector("table.js-report-builder-table tr td")
        for tr in await page.querySelectorAll("table.js-report-builder-table tr"):
            tds = [await page.evaluate('e => e.innerText',td) for td in await tr.querySelectorAll("th,td")]
            print(tds)
    finally:
        # Always release the browser process.
        await browser.close()
if __name__ == '__main__':
    # asyncio.run creates the event loop, runs the coroutine to completion and
    # closes the loop; calling get_event_loop() with no running loop is deprecated.
    asyncio.run(get_table(url))
The output looks like:
['Name', 'Organization', 'Date', 'Location', 'Attack', 'Type of Death', 'Charge']
['Abadullah Hananzai', 'Radio Azadi,Radio Free Europe/Radio Liberty', 'April 30, 2018', 'Afghanistan', 'Killed', 'Murder', '']
['Abay Hailu', 'Agiere', 'February 9, 1998', 'Ethiopia', 'Killed', 'Dangerous Assignment', '']
['Abd al-Karim al-Ezzo', 'Freelance', 'December 21, 2012', 'Syria', 'Killed', 'Crossfire', '']
['Abdallah Bouhachek', 'Révolution et Travail', 'February 10, 1996', 'Algeria', 'Killed', 'Murder', '']
['Abdel Aziz Mahmoud Hasoun', 'Masar Press', 'September 5, 2013', 'Syria', 'Killed', 'Crossfire', '']
['Abdel Karim al-Oqda', 'Shaam News Network', 'September 19, 2012', 'Syria', 'Killed', 'Murder', '']

Resources