"Column not iterable" error in PySpark count vectorization

I'm trying to perform count vectorization using a function I've created; however, I keep getting an error stating "column not iterable", and I cannot figure out why or how to resolve it.
(dfNew is the DataFrame with just two of the columns, both of StringType.)
import string
import re
import nltk
from nltk.stem import PorterStemmer
from pyspark.sql import functions as F
from sklearn.feature_extraction.text import CountVectorizer

ps = PorterStemmer()
dfNew = df.select(F.col('Description'), F.col('ID'))

def clean_description(text):
    # strip punctuation (character by character) and lowercase
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    # collapse newlines into spaces and tokenize
    text = re.sub(r'[\n\r]+', ' ', text).lower()
    Description = re.split(r'\W+', text)
    # stem and drop NLTK English stopwords
    text = [ps.stem(word) for word in Description if word not in nltk.corpus.stopwords.words('english')]
    # drop domain-specific stopwords from the already-filtered tokens
    more_stop_words = ['please', 'search', 'submitted', 'thank', 'com', 'url', 'https', 'via', 'www']
    text = [word for word in text if word not in more_stop_words]
    return text

count_vectorize = CountVectorizer(analyzer=clean_description)
vectorized = count_vectorize.fit_transform(dfNew['Description'])
What am I doing wrong, and how can this be resolved?
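For what it's worth, the error occurs because dfNew['Description'] is a PySpark Column object, not an iterable of strings, and scikit-learn's CountVectorizer can only consume the latter. A minimal sketch of one workaround, assuming the column fits in driver memory, is to collect it into pandas first:

descriptions = dfNew.select('Description').toPandas()['Description']  # plain pandas Series of strings
vectorized = count_vectorize.fit_transform(descriptions)  # an ordinary iterable, so sklearn can consume it

(For data that does not fit on the driver, Spark ML's own pyspark.ml.feature.CountVectorizer would be the alternative.)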

Related

How do I make it so that these blank spaces are filled up with the next image, instead of opening up another page

How do I make it so that these blank spaces are filled up with the next image, instead of another page being opened? Is there some sort of if statement for docx? This is what I mean, visually:
if (space)
    -> insert image there
My code:
import docx
from PIL import Image
import os, os.path
from docx.enum.section import WD_ORIENT
from docx.enum.section import WD_SECTION

mydoc = docx.Document()
imgs = []
path1 = "J:/BBIS-G8/Biology/"
valid_images = [".jpg", ".gif", ".png", ".tga"]
a = 0  # counter; initialized once, before the loop
for f in os.listdir(path1):
    ext = os.path.splitext(f)[1]
    if ext.lower() not in valid_images:
        continue
    a += 1
    file1 = os.path.join(path1, f)
    imgs.append(Image.open(file1))
    # a new landscape section is added for every image,
    # so each image starts on its own page
    current_section = mydoc.sections[0]
    new_width, new_height = current_section.page_height, current_section.page_width
    new_section = mydoc.add_section(WD_SECTION.NEW_PAGE)
    new_section.orientation = WD_ORIENT.LANDSCAPE
    new_section.page_width = new_width
    new_section.page_height = new_height
    mydoc.add_picture(file1, width=docx.shared.Inches(4), height=docx.shared.Inches(5))
mydoc.save("J:/biology.docx")
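python-docx does not expose how much space is left on a page, so there is no direct if (space) test. A hedged sketch of one common workaround, reusing path1 and valid_images from above, is to lay the images out yourself, for example two per page in a one-row table:

import docx
from docx.shared import Inches

mydoc = docx.Document()
# collect the valid image paths first
files = [os.path.join(path1, f) for f in os.listdir(path1)
         if os.path.splitext(f)[1].lower() in valid_images]

for i in range(0, len(files), 2):  # take the images two at a time
    table = mydoc.add_table(rows=1, cols=2)
    for cell, img in zip(table.rows[0].cells, files[i:i + 2]):
        cell.paragraphs[0].add_run().add_picture(img, width=Inches(3))
    if i + 2 < len(files):  # page break between pairs
        mydoc.add_page_break()

mydoc.save("J:/biology_pairs.docx")  # hypothetical output path

How many images fit per page then becomes a layout decision made up front (table columns times rows), rather than something detected at run time.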

How to import data from an HTML table on a website to Excel?

I would like to do some statistical analysis with Python on the live casino game called Crazy Time from Evolution Gaming. There is a website that has the data to do this: https://tracksino.com/crazytime. I want the data from the lowest table, 'Spin History', to be imported into Excel. However, I do not know how this can be done. Could anyone give me an idea where to start?
Thanks in advance!
Try the below code:
import json
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
import csv
import datetime

def scrap_history():
    file_path = ''  # directory where the file should be saved
    file_name = 'spin_history.csv'
    page_number = 1
    while True:
        # dynamic URL fetching the data in chunks of 100
        url = 'https://api.tracksino.com/crazytime_history?filter=&sort_by=&sort_desc=false&page_num=' + str(page_number) + '&per_page=100&period=24hours'
        print('-' * 100)
        print('URL created : ', url)
        response = requests.get(url, verify=False)
        result = json.loads(response.text)  # parse the response body as JSON
        history_data = result['data']
        print(history_data)
        if history_data != []:
            with open(file_path + file_name, 'a+') as history:
                # headers for the file
                csv_headers = ['Occurred At', 'Slot Result', 'Spin Result', 'Total Winners', 'Total Payout']
                csvwriter = csv.DictWriter(history, delimiter=',', lineterminator='\n', fieldnames=csv_headers)
                if page_number == 1:
                    print('Writing CSV header now...')
                    csvwriter.writeheader()
                # write the extracted data to the CSV file row by row
                for item in history_data:
                    value = datetime.datetime.fromtimestamp(item['when'])
                    occurred_at = f'{value:%d-%B-%Y # %H:%M:%S}'
                    csvwriter.writerow({'Occurred At': occurred_at,
                                        'Slot Result': item['slot_result'],
                                        'Spin Result': item['result'],
                                        'Total Winners': item['total_winners'],
                                        'Total Payout': item['total_payout'],
                                        })
            print('-' * 100)
            page_number += 1
            print(page_number)
            print('-' * 100)
        else:
            break
Explanation:
I implemented the above script using the Python requests library. The API URL https://api.tracksino.com/crazytime_history?filter=&sort_by=&sort_desc=false&page_num=1&per_page=50&period=24hours was extracted from the website itself (refer to the screenshot). The script builds a dynamic URL in which the page number changes on every iteration: first page_num=1, then page_num=2, and so on, until all the data has been extracted.
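Note that the snippet above only defines the function; to actually produce the CSV you still need to call it:

# kick off the scrape; the CSV grows one 100-row page at a time
# until the API returns an empty 'data' list
if __name__ == '__main__':
    scrap_history()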

How to fix python returning multiple lines in a .csv document instead of one?

I am trying to scrape data from a public forum for a school project, but every time I run the code, the resulting .csv file shows multiple rows for the text variable instead of just one.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

my_url = 'https://www.emimino.cz/diskuse/1ivf-repromeda-56566/'
uClient = uReq(my_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()

containers = page_soup.findAll("div", {"class": "discussion_post"})
out_filename = "Repromeda.csv"
headers = "text,user_name,date \n"
f = open(out_filename, "w")
f.write(headers)

for container in containers:
    text1 = container.div.p
    text = text1.text
    user_container = container.findAll("span", {"class": "user_category"})
    user_id = user_container[0].text
    date_container = container.findAll("span", {"class": "date"})
    date = date_container[1].text
    print("text: " + text + "\n")
    print("user_id: " + user_id + "\n")
    print("date: " + date + "\n")
    # writes the dataset to file
    f.write(text.replace(",", "|") + ", " + user_id + ", " + date + "\n")
f.close()
Ideally I am trying to create a row for each data entry (i.e. text, user_id, and date in one row), but instead I get multiple rows for one text entry and only one row for the user_id and date entries.
(Screenshots: the actual output vs. the expected output.)
Just replace the newline characters with a space:
for container in containers:
    text1 = container.div.p
    text = text1.text.replace('\n', ' ')
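As a sketch of a slightly more robust variant (same selectors as above), Python's built-in csv module takes care of the comma-quoting, so the manual '|' substitution can go away:

import csv

with open("Repromeda.csv", "w", newline="", encoding="utf-8") as out:
    writer = csv.writer(out)
    writer.writerow(["text", "user_name", "date"])
    for container in containers:
        # collapse embedded newlines so each post occupies a single row
        text = container.div.p.text.replace("\n", " ").strip()
        user_id = container.findAll("span", {"class": "user_category"})[0].text
        date = container.findAll("span", {"class": "date"})[1].text
        writer.writerow([text, user_id, date])  # csv quotes any commas for us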

Python ArcPy - Print Layer with highest field value

I have some python code that goes through layers in my ArcGIS project and prints out the layer names and their corresponding highest value within the field "SUM_USER_VisitCount".
(Screenshot: current output.)
What I want the code to do is only print out the layer name and SUM_USER_VisitCount field value for the one layer with the absolute highest value.
(Screenshot: desired output.)
I have been unable to figure out how to achieve this and can't find anything online either. Can someone help me achieve my desired output?
Sorry if the code layout is a little weird; it got messed up when I pasted it into the "code sample".
Here is my code:
import arcpy
import datetime
from datetime import timedelta
import time

# document start time in order to calculate run time
time1 = time.clock()

# assign project and map frame
p = arcpy.mp.ArcGISProject(r'E:\arcGIS_Shared\Python\CumulativeHeatMaps.aprx')
m = p.listMaps('Map')[0]

Markets = [3000]
# centers to loop through
CA_Centers = ['Castro', 'ColeValley', 'Excelsior', 'GlenPark',
              'LowerPacificHeights', 'Marina', 'NorthBeach', 'RedwoodCity',
              'SanBruno', 'DalyCity']

for Market in Markets:
    print(Market)
    for CA_Center in CA_Centers:
        Layers = m.listLayers("CumulativeSumWithin{0}_{1}_Jun2018".format(Market, CA_Center))
        fields = ['SUM_USER_VisitCount']
        for Layer in Layers:
            print(Layer)
            sqlClause = (None, 'ORDER BY ' + 'SUM_USER_VisitCount')  # + ' DESC'
            with arcpy.da.SearchCursor(in_table=Layer, field_names=fields,
                                       sql_clause=sqlClause) as searchCursor:
                print(max(searchCursor))
You can create a dictionary that stores the result from each query and then print out the highest one at the end.
results_dict = {}
for Market in Markets:
    print(Market)
    for CA_Center in CA_Centers:
        Layers = m.listLayers("CumulativeSumWithin{0}_{1}_Jun2018".format(Market, CA_Center))
        fields = ['SUM_USER_VisitCount']
        for Layer in Layers:
            print(Layer)
            sqlClause = (None, 'ORDER BY ' + 'SUM_USER_VisitCount')
            with arcpy.da.SearchCursor(in_table=Layer, field_names=fields,
                                       sql_clause=sqlClause) as searchCursor:
                # take the max once -- a second max() call would find
                # the cursor already exhausted
                highest = max(searchCursor)
                print(highest)
                results_dict[Layer] = highest

# get the key for the dictionary item with the highest value
highest_count_layer = max(results_dict, key=results_dict.get)
print(highest_count_layer)
print(results_dict[highest_count_layer])
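For clarity, max(results_dict, key=results_dict.get) returns the dictionary key whose stored value is largest, not the value itself. A toy illustration with hypothetical layer names:

counts = {'Castro': 120, 'Marina': 340, 'SanBruno': 95}  # hypothetical values
best = max(counts, key=counts.get)
print(best, counts[best])  # -> Marina 340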

Groovy GroupBy field with and without white spaces

I have invoices list as below
def invoices = [
    'LEDES98BI V2',
    'LINE|INVOICE_DATE|INVOICE_NUMBER|INVOICE_TOTAL',
    '1|20150301|INV-Error_Test1|22',
    '2|20150301|INV-Error_Test1|24',
    '3|20150301|INV-Error_Test2|26',
    '4|20150301|INV-Error_Test2|28,']
I am trying to group the above collection by INVOICE_NUMBER, to get a map with the INVOICE_NUMBER as the key and the lines as the values. The code below does it:
def lines = invoices*.split('\\|').findAll{ it.size()>1 }
def heads = lines.first()
def invoiceMap = lines.tail().collect{ [heads, it].transpose().collectEntries() }.groupBy{ it.INVOICE_NUMBER }
If I print invoiceMap I get the map I intended:
[INV-Error_Test1:[[LINE:1, INVOICE_DATE:20150301, INVOICE_NUMBER:INV-Error_Test1, INVOICE_TOTAL:22],
[LINE:2, INVOICE_DATE:20150301, INVOICE_NUMBER:INV-Error_Test1, INVOICE_TOTAL:24]],
INV-Error_Test2:[[LINE:3, INVOICE_DATE:20150301, INVOICE_NUMBER:INV-Error_Test2, INVOICE_TOTAL:26],
[LINE:4, INVOICE_DATE:20150301, INVOICE_NUMBER:INV-Error_Test2, INVOICE_TOTAL:28,]]
]
But if the INVOICE_NUMBER header contains any white space in the invoices list, my code doesn't work. Can someone help me make my code work with white space in INVOICE_NUMBER?
Use a proper CSV parser, rather than rolling your own.
#Grab('com.xlson.groovycsv:groovycsv:1.0')
import static com.xlson.groovycsv.CsvParser.parseCsv
def invoices = [
    'LEDES98BI V2',
    'LINE|INVOICE_DATE|INVOICE_NUMBER|INVOICE_TOTAL',
    '1|20150301|INV-Error_Test1|22',
    '2|20150301|INV-Error_Test1|24',
    '3|20150301|INV-Error_Test2|26',
    '4|20150301|INV-Error_Test2|28,']
def data = parseCsv(invoices.drop(1).join('\n'), separator:'|')
def invoiceMap = data.collect().groupBy { it.INVOICE_NUMBER }
Or with a space in the column title:
def invoices = [
    'LEDES98BI V2',
    'LINE|INVOICE_DATE|INVOICE NUMBER|INVOICE_TOTAL',
    '1|20150301|INV-Error_Test1|22',
    '2|20150301|INV-Error_Test1|24',
    '3|20150301|INV-Error_Test2|26',
    '4|20150301|INV-Error_Test2|28,']
def data = parseCsv(invoices.drop(1).join('\n'), separator:'|')
def invoiceMap = data.collect().groupBy { it.'INVOICE NUMBER' }
You just need to quote the property name, like this:
def invoiceMap = lines.tail().collect{ [heads, it].transpose().collectEntries() }.groupBy{ it.'INVOICE NUMBER' }
