Error in mapping DataFrame for keyword in ipython - dictionary

I have collected tweets and wish to add a column named taliban that maps (true/false) for the word (Taliban/taliban) present in the tweet. Please see the code and its error.
tweets = pd.DataFrame()
from pandas.io.json import json_normalize
tweets = json_normalize(tweet_data)[["text", "lang", "place.country","created_at", \
"coordinates", "user.location"]]
The code to define word is as follows:
#definition for collecting keyword.
def word_in_text(word, text):
word = word.lower()
text = text.lower()
match = re.search(word, text)
if match:
return True
return False
Then I run this query:
tweets['Taliban'] = tweets['text'].apply(lambda tweet: word_in_text('Taliban', tweet))
I get the following error:
AttributeError Traceback (most recent call last)
<ipython-input-16-17bcf7fbf866> in <module>()
----> 1 tweets['Taliban'] = tweets['text'].apply(lambda tweet: word_in_text('Taliban', tweet))
/usr/lib64/python2.7/site-packages/pandas/core/series.pyc in apply(self, func, convert_dtype, args, **kwds)
2167 values = lib.map_infer(values, lib.Timestamp)
2168
-> 2169 mapped = lib.map_infer(values, f, convert=convert_dtype)
2170 if len(mapped) and isinstance(mapped[0], Series):
2171 from pandas.core.frame import DataFrame
pandas/src/inference.pyx in pandas.lib.map_infer (pandas/lib.c:62578)()
<ipython-input-16-17bcf7fbf866> in <lambda>(tweet)
----> 1 tweets['Taliban'] = tweets['text'].apply(lambda tweet: word_in_text('Taliban', tweet))
<ipython-input-15-56228996ad5c> in word_in_text(word, text)
2 def word_in_text(word, text):
3 word = word.lower()
----> 4 text = text.lower()
5 match = re.search(word, text)
6 if match:
AttributeError: 'float' object has no attribute 'lower'
In [ ]:
I have tried this modification:
#definition for collecting keyword.
def word_in_text(word, text):
word = word.lower()
if type(text) is str:
return text.lower()
match = re.search(word, text)
if match:
return True
return False
to be used in:
tweets['taliban'] = tweets['text'].apply(lambda tweet: word_in_text('taliban', tweet))
But the error given is:
UnboundLocalError: local variable 'match' referenced before assignment

Related

Web Scraping: How do I return specific user input forms in python?

I'm having trouble with the forms returning an exact match for the user input.
Emphasoft developer challenge:
Taking a list of tax form names (ex: "Form W-2", "Form 1095-C"),
search the website and return some informational results.
Specifically, you must return the "Product Number", the "Title", and
the maximum and minimum years the form is available for download.
Taking a tax form name (ex: "Form W-2") and a range of years
(inclusive, 2018-2020 should fetch three years), download all PDFs
available within that range.
import json import os import sys import requests from bs4 import BeautifulSoup
URL = 'https://apps.irs.gov/app/picklist/list/priorFormPublication.html?resultsPerPage=200&sortColumn=sortOrder&indexOfFirstRow=0&{param.strip}&isDescending=false'
def get_forms(list_tax_form: list):
"""
function to get response from iris.gov with all forms content
:param list_tax_form: list of form names that we want to get info about
:return: dict with form name,form title
"""
response_list = [] # list for all responses of form names
with requests.session() as session:
for param in list_tax_form:
request_params = {'value': param,
'criteria': 'formNumber',
'submitSearch': 'Find',
}
res = session.get(URL, params=request_params).content
response_list.append(res)
return response_list
def parse_responses(list_tax_form: list):
"""
function to get all form names, titles years from previous func return
:param list_tax_form: list of form names that we want to get info about
:return: list of form names, titles, years
"""
responses = get_forms(list_tax_form)
# empty lists to fill them with the received information for all names, years, and titles
td_form_name, td_form_title, td_form_rev_year = [], [], []
for response in responses:
soup = BeautifulSoup(response, 'lxml')
td_name = soup.find_all('td', {'class': 'LeftCellSpacer'})
td_title = soup.find_all('td', {'class': 'MiddleCellSpacer'})
td_rev_year = soup.find_all('td', {'class': 'EndCellSpacer'})
td_form_name.extend(td_name)
td_form_title.extend(td_title)
td_form_rev_year.extend(td_rev_year)
return td_form_name, td_form_title, td_form_rev_year
def format_responses(list_tax_form: list):
"""
function to formate all responses for all forms we got!
1 Task
:param list_tax_form: list of form names that we want to get info about
:return: formated names,links,years
"""
td_names, td_titles, td_years = parse_responses(list_tax_form)
names = [name.text.strip() for name in td_names]
links = [link.find('a')['href'] for link in td_names]
titles = [title.text.strip() for title in td_titles]
years = [int(year.text.strip()) for year in td_years]
set_names = set(names)
final_dict = []
# loop to create dictionary of result information with years of tax form available to download
for name in set_names:
max_year = 0
min_year = max(years)
dict1 = {'form_number': name}
for index, p_name in enumerate(names):
if p_name == name:
if years[index] > max_year:
max_year = years[index]
elif years[index] < min_year:
min_year = years[index]
dict1['form_title'] = titles[index]
dict1['max_year'] = max_year
dict1['min_year'] = min_year
final_dict.append(dict1)
print(json.dumps(final_dict, indent=2))
return names, links, years
def download_files(list_tax_form):
"""
2 Task
Module to download pdf files of form_name that input from user.
:param list_tax_form: list of form names that we want to get info about
:return: message to user of successful create file or either
"""
names, links, years = format_responses(list_tax_form)
form_name = input('enter form name: ')
if form_name in names:
print('form exists. enter years range')
form_year1 = int(input('start year to analysis: '))
form_year2 = int(input('end year to analysis: '))
try:
os.mkdir(form_name)
except FileExistsError:
pass
# indecies to define names range in list of all tax form names
r_index = names.index(form_name) # index of first form_name mention on list
l_index = names.index(form_name) # index of last form_name mention on list
for name in names:
if name == form_name:
r_index += 1
years = years[l_index:r_index]
if form_year1 < form_year2:
range_years = range(form_year1, form_year2 + 1)
for year in range_years:
if year in years:
link = links[years.index(year)]
form_file = requests.get(link, allow_redirects=True)
open(f'{form_name}/{form_name}_{str(year)}.pdf', 'wb').write(form_file.content)
print(f'files saved to {form_name}/ directory!')
else:
print('input correct form name!')
if __name__ == '__main__':
tax_list = sys.argv[1:] # form names
download_files(tax_list)
(ex: "Form W-2" should not return "Form W-2 P")
When this file is ran, it is displaying other unrelated results.
How can I resolve this issue to display only specified user requests?

Download multiple 10-ks documents

I need to download multiple 10-ks documents, however, this code works fine if i download the 10-ks between 5-10 companies. But if i increase the number of companies in [cik_lookup function]. Here's code.
import nltk
import numpy as np
import pandas as pd
import pickle
import pprint
import project_helper
from tqdm import tqdm
Here's the py file that includes project_helper functions.
import matplotlib.pyplot as plt
import requests
from ratelimit import limits, sleep_and_retry
class SecAPI(object):
SEC_CALL_LIMIT = {'calls': 10, 'seconds': 1}
#staticmethod
#sleep_and_retry
# Dividing the call limit by half to avoid coming close to the limit
#limits(calls=SEC_CALL_LIMIT['calls'] / 2, period=SEC_CALL_LIMIT['seconds'])
def _call_sec(url):
return requests.get(url)
def get(self, url):
return self._call_sec(url).text
def print_ten_k_data(ten_k_data, fields, field_length_limit=50):
indentation = ' '
print('[')
for ten_k in ten_k_data:
print_statement = '{}{{'.format(indentation)
for field in fields:
value = str(ten_k[field])
# Show return lines in output
if isinstance(value, str):
value_str = '\'{}\''.format(value.replace('\n', '\\n'))
else:
value_str = str(value)
# Cut off the string if it gets too long
if len(value_str) > field_length_limit:
value_str = value_str[:field_length_limit] + '...'
print_statement += '\n{}{}: {}'.format(indentation * 2, field, value_str)
print_statement += '},'
print(print_statement)
print(']')
The first step it to download NLP Corpora.
nltk.download('stopwords')
nltk.download('wordnet')
Than Get 10ks
#cik_lookup = {
# 'GOOGL':'0001288776',
# 'AAPL':'0000320193',
# 'FACEBOOK':'0001326801',
# 'AMZN':'0001018724',
# 'MSFT':'0000789019'}
cik_lookup = {
'AEP': '0000004904',
'AXP': '0000004962',
'BA': '0000012927',
'BK': '0001390777',
'CAT': '0000018230',
'DE': '0000315189',
'DIS': '0001001039',
'DTE': '0000936340',
'ED': '0001047862',
'EMR': '0000032604',
'ETN': '0001551182',
'GE': '0000040545',
'IBM': '0000051143',
'IP': '0000051434',
'JNJ': '0000200406',
'KO': '0000021344',
'LLY': '0000059478',
'MCD': '0000063908',
'MO': '0000764180',
'MRK': '0000310158',
'MRO': '0000101778',
'PCG': '0001004980',
'PEP': '0000077476',
'PFE': '0000078003',
'PG': '0000080424',
'PNR': '0000077360',
'SYY': '0000096021',
'TXN': '0000097476',
'UTX': '0000101829',
'WFC': '0000072971',
'WMT': '0000104169',
'WY': '0000106535',
'XOM': '0000034088'}
Get list of 10-ks
sec_api = project_helper.SecAPI()
from bs4 import BeautifulSoup
def get_sec_data(cik, doc_type, start=0, count=60):
newest_pricing_data = pd.to_datetime('2021-01-01')
rss_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany' \
'&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom' \
.format(cik, doc_type, start, count)
sec_data = sec_api.get(rss_url)
feed = BeautifulSoup(sec_data.encode('utf-8'), 'xml').feed
entries = [
(
entry.content.find('filing-href').getText(),
entry.content.find('filing-type').getText(),
entry.content.find('filing-date').getText())
for entry in feed.find_all('entry', recursive=False)
if pd.to_datetime(entry.content.find('filing-date').getText()) <= newest_pricing_data]
return entries
example_ticker = 'AEP'
sec_data = {}
for ticker, cik in cik_lookup.items():
sec_data[ticker] = get_sec_data(cik, '10-K')
The code works fine if i download the 10-ks between 5-10 companies. But if i increase the number of companies in [cik_lookup function] I get the following error. The first error I got is as below.
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-8-28a784054794> in <module>()
20
21 for ticker, cik in cik_lookup.items():
---> 22 sec_data[ticker] = get_sec_data(cik, '10-K')
<ipython-input-8-28a784054794> in get_sec_data(cik, doc_type, start, count)
5 rss_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany' '&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom' .format(cik, doc_type, start, count)
6 sec_data = sec_api.get(rss_url)
----> 7 feed = BeautifulSoup(sec_data.encode('ascii'), 'xml').feed
8 entries = [
9 (
UnicodeEncodeError: 'ascii' codec can't encode characters in position 2599-2601: ordinal not in range(128)
However, after some google search over BeutifulSoup(ecodes) I changed it to utf-8 and then got the following error.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-9-9c77ed07af2d> in <module>()
20
21 for ticker, cik in cik_lookup.items():
---> 22 sec_data[ticker] = get_sec_data(cik, '10-K')
<ipython-input-9-9c77ed07af2d> in get_sec_data(cik, doc_type, start, count)
11 entry.content.find('filing-type').getText(),
12 entry.content.find('filing-date').getText())
---> 13 for entry in feed.find_all('entry', recursive=False)
14 if pd.to_datetime(entry.content.find('filing-date').getText()) <= newest_pricing_data]
15
AttributeError: 'NoneType' object has no attribute 'find_all'
The project can be accessed here at the following github repo.
github repo herealso.

tuple index out of range for printing a index value

While executing the following code i'm getting below error, Just for information matchObj here returns a tuple value ..
$ ./ftpParser3_re_dup.py
Traceback (most recent call last):
File "./ftpParser3_re_dup.py", line 13, in <module>
print("{0:<30}{1:<20}{2:<50}{3:<15}".format("FTP ACCOUNT","Account Type","Term Flag"))
IndexError: tuple index out of range
Code is below:
from __future__ import print_function
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE,SIG_DFL)
import re
with open('all_adta', 'r') as f:
for line in f:
line = line.strip()
data = f.read()
# Making description & termflag optional in the regex pattern as it's missing in the "data_test" file with several occurrences.
regex = (r"dn:(.*?)\nftpuser: (.*)\n(?:description:* (.*))?\n(?:termflag:* (.*))")
matchObj = re.findall(regex, data)
print("{0:<30}{1:<20}{2:<50}{3:<15}".format("FTP ACCOUNT","Account Type","Term Flag"))
print("{0:<30}{1:<20}{2:<50}{3:<15}".format("-----------","------------","--------"))
for index in matchObj:
index_str = ' '.join(index)
new_str = re.sub(r'[=,]', ' ', index_str)
new_str = new_str.split()
# In below print statement we are using "index[2]" as index is tuple here, this is because
# findall() returns the matches as a list, However with groups, it returns it as a list of tuples.
print("{0:<30}{1:<20}{2:<50}{3:<15}".format(new_str[1],new_str[8],index[2],index[3]))
In the line print("{0:<30}{1:<20}{2:<50}{3:<15}".format("FTP ACCOUNT","Account Type","Term Flag")) you have mentioned 4 indices but given only 3 i.e. "FTP ACCOUNT","Account Type","Term Flag"
Remove the 4th index or add a new one

IndexError: list index out of range, scores.append( (fields[0], fields[1]))

I'm trying to read a file and put contents in a list. I have done this mnay times before and it has worked but this time it throws back the error "list index out of range".
the code is:
with open("File.txt") as f:
scores = []
for line in f:
fields = line.split()
scores.append( (fields[0], fields[1]))
print(scores)
The text file is in the format;
Alpha:[0, 1]
Bravo:[0, 0]
Charlie:[60, 8, 901]
Foxtrot:[0]
I cant see why it is giving me this problem. Is it because I have more than one value for each item? Or is it the fact that I have a colon in my text file?
How can I get around this problem?
Thanks
If I understand you well this code will print you desired result:
import re
with open("File.txt") as f:
# Let's make dictionary for scores {name:scores}.
scores = {}
# Define regular expressin to parse team name and team scores from line.
patternScore = '\[([^\]]+)\]'
patternName = '(.*):'
for line in f:
# Find value for team name and its scores.
fields = re.search(patternScore, line).groups()[0].split(', ')
name = re.search(patternName, line).groups()[0]
# Update dictionary with new value.
scores[name] = fields
# Print output first goes first element of keyValue in dict then goes keyName
for key in scores:
print (scores[key][0] + ':' + key)
You will recieve following output:
60:Charlie
0:Alpha
0:Bravo
0:Foxtrot

Can openoffice count words from console?

i have a small problem i need to count words inside the console to read doc, docx, pptx, ppt, xls, xlsx, odt, pdf ... so don't suggest me | wc -w or grep because they work only with text or console output and they count only spaces and in japanese, chinese, arabic , hindu , hebrew they use diferent delimiter so the word count is wrong and i tried to count with this
pdftotext file.pdf -| wc -w
/usr/local/bin/docx2txt.pl < file.docx | wc -w
/usr/local/bin/pptx2txt.pl < file.pptx | wc -w
antiword file.doc -| wc -w
antiword file.word -| wc -w
in some cases microsoft word , openoffice sad 1000 words and the counters return 10 or 300 words if the language is ( japanese , chinese, hindu ect... ) , but if i use normal characters then i have no issue the biggest mistake is in some case 3 chars less witch is "OK"
i tried to convert with soffice , openoffice and then try WC -w but i can't even convert ,
soffice --headless --nofirststartwizard --accept=socket,host=127.0.0.1,port=8100; --convert-to pdf some.pdf /var/www/domains/vocabridge.com/devel/temp_files/23/0/东京_1000_words_Docx.docx
OR
openoffice.org --headless --convert-to ........
OR
openoffice.org3 --invisible
so if someone know any way to count correctly or display document statistic with openoffice or anything else or linux with the console please share it
thanks.
If you have Microsoft Word (and Windows, obviously) you can write a VBA macro or if you want to run straight from the command line you can write a VBScript script with something like the following:
wordApp = CreateObject("Word.Application")
doc = ... ' open up a Word document using wordApp
docWordCount = doc.Words.Count
' Rinse and repeat...
If you have OpenOffice.org/LibreOffice you have similar (but more) options. If you want to stay in the office app and run a macro you can probably do that. I don't know the StarBasic API well enough to tell you how but I can give you the alternative: creating a Python script to get the word count from the command line. Roughly speaking, you do the following:
Start up your copy of OOo/LibO from the command line with the appropriate parameters to accept incoming socket connections. http://www.openoffice.org/udk/python/python-bridge.html has instructions on how to do that. Go there and use the browser's in-page find feature to search for `accept=socket'
Write a Python script to use the OOo/LibO UNO bridge (basically equivalent to the VBScript example above) to open up your Word/ODT documents one at a time and get the word count from each. The above page should give you a good start to doing that.
You get the word count from a document model object's WordCount property: http://www.openoffice.org/api/docs/common/ref/com/sun/star/text/GenericTextDocument.html#WordCount
Just building on to what #Yawar wrote. Here is is more explicit steps for how to word count with MS word from the console.
I also use the more accurate Range.ComputeStatistics(wdStatisticWords) instead of the Words property. See here for more info: https://support.microsoft.com/en-za/help/291447/word-count-appears-inaccurate-when-you-use-the-vba-words-property
Make a script called wc.vbs and then put this in it:
Set word = CreateObject("Word.Application")
word.Visible = False
Set doc = word.Documents.Open("<replace with absolute path to your .docx/.pdf>")
docWordCount = doc.Range.ComputeStatistics(wdStatisticWords)
word.Quit
Dim StdOut : Set StdOut = CreateObject("Scripting.FileSystemObject").GetStandardStream(1)
WScript.Echo docWordCount & " words"
Open powershell in the directory you saved wc.vbs and run cscript .\wc.vbs and you'll get back the word count :)
I think this may do what you are aiming for
# Continuously updating word count
import unohelper, uno, os, time
from com.sun.star.i18n.WordType import WORD_COUNT
from com.sun.star.i18n import Boundary
from com.sun.star.lang import Locale
from com.sun.star.awt import XTopWindowListener
#socket = True
socket = False
localContext = uno.getComponentContext()
if socket:
resolver = localContext.ServiceManager.createInstanceWithContext('com.sun.star.bridge.UnoUrlResolver', localContext)
ctx = resolver.resolve('uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext')
else: ctx = localContext
smgr = ctx.ServiceManager
desktop = smgr.createInstanceWithContext('com.sun.star.frame.Desktop', ctx)
waittime = 1 # seconds
def getWordCountGoal():
doc = XSCRIPTCONTEXT.getDocument()
retval = 0
# Only if the field exists
if doc.getTextFieldMasters().hasByName('com.sun.star.text.FieldMaster.User.WordCountGoal'):
# Get the field
wordcountgoal = doc.getTextFieldMasters().getByName('com.sun.star.text.FieldMaster.User.WordCountGoal')
retval = wordcountgoal.Content
return retval
goal = getWordCountGoal()
def setWordCountGoal(goal):
doc = XSCRIPTCONTEXT.getDocument()
if doc.getTextFieldMasters().hasByName('com.sun.star.text.FieldMaster.User.WordCountGoal'):
wordcountgoal = doc.getTextFieldMasters().getByName('com.sun.star.text.FieldMaster.User.WordCountGoal')
wordcountgoal.Content = goal
# Refresh the field if inserted in the document from Insert > Fields >
# Other... > Variables > Userdefined fields
doc.TextFields.refresh()
def printOut(txt):
if socket: print txt
else:
model = desktop.getCurrentComponent()
text = model.Text
cursor = text.createTextCursorByRange(text.getEnd())
text.insertString(cursor, txt + '\r', 0)
def hotCount(st):
'''Counts the number of words in a string.
ARGUMENTS:
str st: count the number of words in this string
RETURNS:
int: the number of words in st'''
startpos = long()
nextwd = Boundary()
lc = Locale()
lc.Language = 'en'
numwords = 1
mystartpos = 1
brk = smgr.createInstanceWithContext('com.sun.star.i18n.BreakIterator', ctx)
nextwd = brk.nextWord(st, startpos, lc, WORD_COUNT)
while nextwd.startPos != nextwd.endPos:
numwords += 1
nw = nextwd.startPos
nextwd = brk.nextWord(st, nw, lc, WORD_COUNT)
return numwords
def updateCount(wordCountModel, percentModel):
'''Updates the GUI.
Updates the word count and the percentage completed in the GUI. If some
text of more than one word is selected (including in multiple selections by
holding down the Ctrl/Cmd key), it updates the GUI based on the selection;
if not, on the whole document.'''
model = desktop.getCurrentComponent()
try:
if not model.supportsService('com.sun.star.text.TextDocument'):
return
except AttributeError: return
sel = model.getCurrentSelection()
try: selcount = sel.getCount()
except AttributeError: return
if selcount == 1 and sel.getByIndex(0).getString == '':
selcount = 0
selwords = 0
for nsel in range(selcount):
thisrange = sel.getByIndex(nsel)
atext = thisrange.getString()
selwords += hotCount(atext)
if selwords > 1: wc = selwords
else:
try: wc = model.WordCount
except AttributeError: return
wordCountModel.Label = str(wc)
if goal != 0:
pc_text = 100 * (wc / float(goal))
#pc_text = '(%.2f percent)' % (100 * (wc / float(goal)))
percentModel.ProgressValue = pc_text
else:
percentModel.ProgressValue = 0
# This is the user interface bit. It looks more or less like this:
###############################
# Word Count _ o x #
###############################
# _____ #
# 451 / |500| #
# ----- #
# ___________________________ #
# |############## | #
# --------------------------- #
###############################
# The boxed `500' is the text entry box.
class WindowClosingListener(unohelper.Base, XTopWindowListener):
def __init__(self):
global keepGoing
keepGoing = True
def windowClosing(self, e):
global keepGoing
keepGoing = False
setWordCountGoal(goal)
e.Source.setVisible(False)
def addControl(controlType, dlgModel, x, y, width, height, label, name = None):
control = dlgModel.createInstance(controlType)
control.PositionX = x
control.PositionY = y
control.Width = width
control.Height = height
if controlType == 'com.sun.star.awt.UnoControlFixedTextModel':
control.Label = label
elif controlType == 'com.sun.star.awt.UnoControlEditModel':
control.Text = label
elif controlType == 'com.sun.star.awt.UnoControlProgressBarModel':
control.ProgressValue = label
if name:
control.Name = name
dlgModel.insertByName(name, control)
else:
control.Name = 'unnamed'
dlgModel.insertByName('unnamed', control)
return control
def loopTheLoop(goalModel, wordCountModel, percentModel):
global goal
while keepGoing:
try: goal = int(goalModel.Text)
except: goal = 0
updateCount(wordCountModel, percentModel)
time.sleep(waittime)
if not socket:
import threading
class UpdaterThread(threading.Thread):
def __init__(self, goalModel, wordCountModel, percentModel):
threading.Thread.__init__(self)
self.goalModel = goalModel
self.wordCountModel = wordCountModel
self.percentModel = percentModel
def run(self):
loopTheLoop(self.goalModel, self.wordCountModel, self.percentModel)
def wordCount(arg = None):
'''Displays a continuously updating word count.'''
dialogModel = smgr.createInstanceWithContext('com.sun.star.awt.UnoControlDialogModel', ctx)
dialogModel.PositionX = XSCRIPTCONTEXT.getDocument().CurrentController.Frame.ContainerWindow.PosSize.Width / 2.2 - 105
dialogModel.Width = 100
dialogModel.Height = 30
dialogModel.Title = 'Word Count'
lblWc = addControl('com.sun.star.awt.UnoControlFixedTextModel', dialogModel, 6, 2, 25, 14, '', 'lblWc')
lblWc.Align = 2 # Align right
addControl('com.sun.star.awt.UnoControlFixedTextModel', dialogModel, 33, 2, 10, 14, ' / ')
txtGoal = addControl('com.sun.star.awt.UnoControlEditModel', dialogModel, 45, 1, 25, 12, '', 'txtGoal')
txtGoal.Text = goal
#addControl('com.sun.star.awt.UnoControlFixedTextModel', dialogModel, 6, 25, 50, 14, '(percent)', 'lblPercent')
ProgressBar = addControl('com.sun.star.awt.UnoControlProgressBarModel', dialogModel, 6, 15, 88, 10,'' , 'lblPercent')
ProgressBar.ProgressValueMin = 0
ProgressBar.ProgressValueMax =100
#ProgressBar.Border = 2
#ProgressBar.BorderColor = 255
#ProgressBar.FillColor = 255
#ProgressBar.BackgroundColor = 255
addControl('com.sun.star.awt.UnoControlFixedTextModel', dialogModel, 124, 2, 12, 14, '', 'lblMinus')
controlContainer = smgr.createInstanceWithContext('com.sun.star.awt.UnoControlDialog', ctx)
controlContainer.setModel(dialogModel)
controlContainer.addTopWindowListener(WindowClosingListener())
controlContainer.setVisible(True)
goalModel = controlContainer.getControl('txtGoal').getModel()
wordCountModel = controlContainer.getControl('lblWc').getModel()
percentModel = controlContainer.getControl('lblPercent').getModel()
ProgressBar.ProgressValue = percentModel.ProgressValue
if socket:
loopTheLoop(goalModel, wordCountModel, percentModel)
else:
uthread = UpdaterThread(goalModel, wordCountModel, percentModel)
uthread.start()
keepGoing = True
if socket:
wordCount()
else:
g_exportedScripts = wordCount,
Link for more info
https://superuser.com/questions/529446/running-word-count-in-openoffice-writer
Hope this helps regards tom
EDIT : Then i found this
http://forum.openoffice.org/en/forum/viewtopic.php?f=7&t=22555
wc can understand Unicode and uses system's iswspace function to find whether the unicode character is whitespace. "The iswspace() function tests whether wc is a wide-character code representing a character of class space in the program's current locale." So, wc -w should be able to correctly count words if your locale (LC_CTYPE) is configured correctly.
The source code of the wc program
The manual page for the iswspace function
I found the answer create one service
#!/bin/sh
#
# chkconfig: 345 99 01
#
# description: your script is a test service
#
(while sleep 1; do
ls pathwithfiles/in | while read file; do
libreoffice --headless -convert-to pdf "pathwithfiles/in/$file" --outdir pathwithfiles/out
rm "pathwithfiles/in/$file"
done
done) &
then the php script that i needed counted everything
$ext = pathinfo($absolute_file_path, PATHINFO_EXTENSION);
if ($ext !== 'txt' && $ext !== 'pdf') {
// Convert to pdf
$tb = mktime() . mt_rand();
$tempfile = 'locationofpdfs/in/' . $tb . '.' . $ext;
copy($absolute_file_path, $tempfile);
$absolute_file_path = 'locationofpdfs/out/' . $tb . '.pdf';
$ext = 'pdf';
while (!is_file($absolute_file_path)) sleep(1);
}
if ($ext !== 'txt') {
// Convert to txt
$tempfile = tempnam(sys_get_temp_dir(), '');
shell_exec('pdftotext "' . $absolute_file_path . '" ' . $tempfile);
$absolute_file_path = $tempfile;
$ext = 'txt';
}
if ($ext === 'txt') {
$seq = '/[\s\.,;:!\? ]+/mu';
$plain = file_get_contents($absolute_file_path);
$plain = preg_replace('#\{{{.*?\}}}#su', "", $plain);
$str = preg_replace($seq, '', $plain);
$chars = count(preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY));
$words = count(preg_split($seq, $plain, -1, PREG_SPLIT_NO_EMPTY));
if ($words === 0) return $chars;
if ($chars / $words > 10) $words = $chars;
return $words;
}

Resources