Apache Tika is indexing HTTP response instead of document content - wordpress

I'm using Solr 8.3 and Tika to index Wordpress (version 4.9.7) contents and attachments. Solr and Wordpress servers are in the same internal network in the company. Due to an organizational decision, I'm not using plugins such as WP-Solr and others (all of them good enough).
I wrote data-config.xml and managed-schema files, and uploaded them to Zookeeper. These files are updated in Solr admin interface. So I created a new collection, called wp, and indexed some files (in Solr admin interface, I set the range from 0 to 200).
So, when I query the contents, the meta fields are correctly indexed, but the conteudo_text and _text_ fields contain the body of an HTTP 301 response (example below):
{
"responseHeader":{
"zkConnected":true,
"status":0,
"QTime":18,
"params":{
"q":"*:*",
"start":"0",
"rows":"1",
"_":"1580842143627"}},
"response":{"numFound":500,"start":0,"maxScore":1.0,"docs":[
{
"data_alteracao":"2019-09-06T11:05:10Z",
"conteudo":"Criação",
"titulo":"Criação",
"id":"37829",
"data_publicacao":"2019-09-06T11:04:55Z",
"url":"http://www.homolog.tjrs.jus.br/static/2019/09/estag-criacao.pdf",
"conteudo_text":["\nMoved Permanently\n\nThe document has moved here.\n\n\n\nApache Server at www.homolog.tjrs.jus.br Port 80\n\n"],
"_text_":["\nMoved Permanently\n\nThe document has moved here.\n\n\n\nApache Server at www.homolog.tjrs.jus.br Port 80\n\n"],
"_version_":1657631228775366656}]
}}
My data-config.xml:
<!--
  DataImportHandler configuration.
  Reads WordPress attachment rows from MySQL (data source "wpdb") and, for each
  row, fetches the file behind the row's URL (data source "url_doc") and passes
  it to TikaEntityProcessor to extract its text.
-->
<dataConfig>
  <dataSource
      type="JdbcDataSource"
      driver="com.mysql.jdbc.Driver"
      url="jdbc:mysql://mysql-grid-homol.tjrs.gov.br:3306/wordpress"
      user="usr"
      password="pwd"
      name="wpdb"
      batchSize="-1"
      readOnly="true"
  />
  <dataSource
      type="BinURLDataSource"
      name="url_doc"
  />
  <document name="docs">
    <!--
      Outer entity: one row per attachment whose MIME type starts with "application".
      NOTE(review): the URL column is built with a hard-coded 'http:' scheme in
      front of guid. If the web server redirects that scheme (for example to
      https), the body fetched by the inner entity would be the redirect page
      rather than the document. Confirm which scheme actually serves the files.
      NOTE(review): the LEFT JOIN on wpw_postmeta is neither selected from nor
      filtered on in this query. Confirm it is still needed.
    -->
    <entity
        dataSource="wpdb"
        name="wp"
        pk="ID"
        query="
          SELECT
            post.Id ID,
            post_title TITULO,
            IF (post_content = '', post_title, post_content) CONTEUDO,
            CONCAT
            (
              DATE_FORMAT(post.Post_date, '%Y-%m-%d'),
              'T',
              DATE_FORMAT(post.Post_date, '%H:%i:%s'),
              'Z'
            ) DATA_PUBLICACAO,
            CONCAT
            (
              DATE_FORMAT(post.Post_modified, '%Y-%m-%d'),
              'T',
              DATE_FORMAT(post.Post_modified, '%H:%i:%s'),
              'Z'
            ) DATA_ALTERACAO,
            CONCAT
            (
              'http:',
              guid
            ) URL
          FROM
            wpw_posts post
            LEFT JOIN wpw_postmeta postmeta
              ON (postmeta.Post_id = post.Id AND postmeta.Meta_key = 'publico')
          WHERE
            post.Post_type IN ('page', 'noticia', 'evento', 'curso', 'sistema', 'classificado', 'discurso', 'attachment')
            AND post.post_status = 'inherit'
            AND post.post_mime_type like 'application%'
          ORDER BY post.Post_date DESC
        "
    >
      <field column="ID" name="id"/>
      <field column="TITULO" name="titulo"/>
      <field column="CONTEUDO" name="conteudo"/>
      <!--
        The SQL above already renders both dates as yyyy-MM-dd'T'HH:mm:ss'Z'.
        The former dateTimeFormat="DD/MM/YYYY'T'hh:mm:ss" attributes did not
        match that layout, and this entity declares no DateFormatTransformer to
        consume them, so they were removed.
      -->
      <field column="DATA_PUBLICACAO" name="data_publicacao"/>
      <field column="DATA_ALTERACAO" name="data_alteracao"/>
      <field column="URL" name="url"/>
      <!--
        Inner entity: downloads the binary behind ${wp.URL} and maps Tika's
        "text" column onto conteudo_text. onError="continue" keeps the import
        running when a single file cannot be processed.
      -->
      <entity
          name="arquivo"
          dataSource="url_doc"
          processor="TikaEntityProcessor"
          url="${wp.URL}"
          format="text"
          onError="continue"
          extractEmbedded="true"
      >
        <field column="text" name="conteudo_text" />
      </entity>
    </entity>
  </document>
</dataConfig>
My managed-schema:
<?xml version="1.0" encoding="UTF-8" ?>
<!-- Schema "v2": WordPress post metadata plus the text extracted from attachments. -->
<schema name="v2" version="1.6">

  <!-- Document key -->
  <field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" />
  <uniqueKey>id</uniqueKey>

  <!-- Post metadata -->
  <field name="titulo" type="string" indexed="true" stored="true" required="true" />
  <field name="conteudo" type="string" indexed="true" stored="true" required="true" />
  <field name="data_publicacao" type="date" indexed="true" stored="true" docValues="true"/>
  <field name="data_alteracao" type="date" indexed="true" stored="true" docValues="true" />
  <field name="url" type="string" indexed="true" stored="true" required="true" multiValued="false"/>

  <!-- Extracted attachment text and catch-all search fields -->
  <field name="conteudo_text" type="text" indexed="true" stored="true" required="true" multiValued="true" default=" "/>
  <field name="text" type="sem_aspas" indexed="true" stored="true" required="true" multiValued="true"/>

  <!-- Internal fields -->
  <field name="_version_" type="long" indexed="false" stored="false" />
  <field name="_root_" type="string" indexed="true" stored="false" docValues="false" />
  <field name="_text_" type="sem_aspas" indexed="true" stored="true" multiValued="true"/>

  <!-- Everything written to conteudo_text is also copied into _text_. -->
  <copyField source="conteudo_text" dest="_text_" />

  <!-- primitive types -->
  <fieldType name="integer" class="solr.IntPointField" docValues="true"/>
  <fieldType name="integers" class="solr.IntPointField" docValues="true" multiValued="true"/>
  <fieldType name="long" class="solr.LongPointField" docValues="true"/>
  <fieldType name="longs" class="solr.LongPointField" docValues="true" multiValued="true"/>
  <fieldType name="string" class="solr.StrField" sortMissingLast="true" docValues="true" />
  <fieldType name="strings" class="solr.StrField" sortMissingLast="true" docValues="true" multiValued="true"/>
  <fieldType name="date" class="solr.DatePointField" docValues="true"/>
  <fieldType name="dates" class="solr.DatePointField" docValues="true" multiValued="true"/>
  <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
  <fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/>
  <fieldType name="float" class="solr.FloatPointField" docValues="true" multiValued="false"/>
  <fieldType name="floats" class="solr.FloatPointField" docValues="true" multiValued="true"/>
  <fieldType name="double" class="solr.DoublePointField" docValues="true" multiValued="false"/>
  <fieldType name="doubles" class="solr.DoublePointField" docValues="true" multiValued="true"/>
  <fieldType name="binary" class="solr.BinaryField"/>

  <!-- Stemmed text: ASCII folding, lower-casing, stop words, Brazilian stemming; synonyms at query time only. -->
  <fieldType name="sem_aspas" class="solr.TextField" positionIncrementGap="100">
    <analyzer type="index">
      <tokenizer class="solr.ClassicTokenizerFactory"/>
      <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-FoldToASCII.txt"/>
      <filter class="solr.LowerCaseFilterFactory"/>
      <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" format="snowball" />
      <filter class="solr.BrazilianStemFilterFactory"/>
    </analyzer>
    <analyzer type="query">
      <tokenizer class="solr.ClassicTokenizerFactory"/>
      <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-FoldToASCII.txt"/>
      <filter class="solr.LowerCaseFilterFactory"/>
      <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" format="snowball" />
      <filter class="solr.BrazilianStemFilterFactory"/>
      <filter class="solr.SynonymGraphFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt"/>
    </analyzer>
  </fieldType>

  <!-- Unstemmed text: ASCII folding, stop words, lower-casing; synonyms at query time only. -->
  <fieldType name="text" class="solr.TextField" positionIncrementGap="100" multiValued="true">
    <analyzer type="index">
      <tokenizer class="solr.ClassicTokenizerFactory"/>
      <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-FoldToASCII.txt"/>
      <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
      <filter class="solr.LowerCaseFilterFactory"/>
    </analyzer>
    <analyzer type="query">
      <tokenizer class="solr.ClassicTokenizerFactory"/>
      <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-FoldToASCII.txt"/>
      <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
      <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
      <filter class="solr.LowerCaseFilterFactory"/>
    </analyzer>
  </fieldType>

</schema>
Things I tried to solve the problem:
1) Change from BinURLDataSource to URLDataSource or FieldStreamDataSource;
2) Include, in the BinURLDataSource definition, a user and password with permission to access the files.
I'm a new user in Solr/Lucene and Tika technology (my 2nd project only), and any help is welcome.
Regards.

Related

SOLR: Document is missing mandatory uniqueKey field: id

I have just started with the solr-6.5.1, trying to import data from Oracle DB.
My data does not have any unique ID. I searched through various forums and found that this can be resolved using class="solr.UUIDUpdateProcessorFactory", but I am still getting the same error. Below are the relevant snippets of my managed-schema and solrconfig.xml.
solrconfig.xml
<!-- /update handler: requests sent here default to the "uuid" update chain. -->
<requestHandler name="/update" class="solr.UpdateRequestHandler">
  <lst name="defaults">
    <str name="update.chain">uuid</str>
  </lst>
</requestHandler>

<!-- "uuid" chain: the UUID processor is configured for the "id" field, then the update runs. -->
<updateRequestProcessorChain name="uuid">
  <processor class="solr.UUIDUpdateProcessorFactory">
    <str name="fieldName">id</str>
  </processor>
  <processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
managed-schema
<!-- Field declarations for the imported rows. -->
<!-- Document id; declared with the "uuid" type and marked required. -->
<field name="id" type="uuid" indexed="true" stored="true" required="true" />
<!-- Required single-valued string columns. -->
<field name="SERVICEACCOUNTNUMBER" type="string" indexed="true" stored="true" required="true" multiValued="false" />
<field name="USERNAME" type="string" indexed="true" stored="true" required="true" multiValued="false" />
<!-- Optional columns. -->
<field name="NAME" type="string" indexed="true" stored="true" multiValued="false" />
<field name="TRANSACTION_DATE" type="date" indexed="true" stored="true" />
<field name="PROCESSSTATUS" type="string" indexed="true" stored="true" />
<field name="RECHARGE_MODE" type="string" indexed="true" stored="true" />
<field name="PROCESSRECHARGEREQUESTNO" type="string" indexed="true" stored="true" />
<field name="PACKAGE_ID" type="string" indexed="true" stored="true" />
<field name="PACKAGENAME" type="string" indexed="true" stored="true" />
<!-- NOTE(review): AMOUNT, CREDITDOCUMENTNUMBER and SERVICE_TYPE are multiValued while the other columns are not; confirm this is intended. -->
<field name="AMOUNT" type="int" indexed="true" stored="true" multiValued="true" />
<field name="CREDITDOCUMENTNUMBER" type="string" indexed="true" stored="true" multiValued="true" />
<field name="DEBITDOCUMENTNUMBER" type="string" indexed="true" stored="true" />
<field name="SERVICE_TYPE" type="string" indexed="true" stored="true" multiValued="true" />
Please help
answer credits to post solr uuid with error document is missing mandatory uniquekey field id
updated the solrconfig.xml to
<!-- /dataimport handler: loads data-config.xml and defaults to the "uuid" update chain. -->
<requestHandler name="/dataimport" class="solr.DataImportHandler">
  <lst name="defaults">
    <str name="config">data-config.xml</str>
    <str name="update.chain">uuid</str>
  </lst>
</requestHandler>

<!-- "uuid" chain: the UUID processor is configured for the "id" field, then the update runs. -->
<updateRequestProcessorChain name="uuid">
  <processor class="solr.UUIDUpdateProcessorFactory">
    <str name="fieldName">id</str>
  </processor>
  <processor class="solr.RunUpdateProcessorFactory"/>
</updateRequestProcessorChain>

Can't remove punctuation in Solr

I have a solr install to query content on a Drupal site. Many of the title fields have punctuation at the start of the string and so when I sort by title the punctuation appears top of the list.
I would like to get Solr to ignore the punctuation in the title when sorting by title, but none of the solutions I have tried work.
I am fairly new to solr and so it may be something really simple that I am doing wrong... I don't really understand much of what is going on in the schema.xml file!
The title field is called label in solr and I have tried various methods in solr.PatternReplaceFilterFactory which do not work.
<!-- Title field; analysed with the "text" type declared below. -->
<field name="label" type="text" indexed="true" stored="true" termVectors="true" omitNorms="true"/>
<!-- Copies label into sort_label, the field the query sorts on.
     NOTE(review): sort_label is not declared in this snippet, so its type cannot be confirmed here.
     NOTE(review): copyField is documented to copy the raw input value, so the analysis chain below
     would not change what sort_label receives; confirm against the Solr copyField documentation. -->
<copyField source="label" dest="sort_label"/>
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<!-- Character filters: accent mapping, then HTML stripping. -->
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
<charFilter class="solr.HTMLStripCharFilterFactory"/>
<!-- Strips leading punctuation. It is declared as a <filter>, i.e. a token filter, although it is
     listed ahead of the tokenizer.
     NOTE(review): as a token filter the pattern presumably applies to each token, not to the whole
     field value; confirm whether a char filter was intended. -->
<filter class="solr.PatternReplaceFilterFactory"
pattern="(^\p{Punct}+)" replacement="" replace="all"
/>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
enablePositionIncrements="true"
/>
<!-- Word delimiter settings: word and number parts generated and catenated, original token preserved. -->
<filter class="solr.WordDelimiterFilterFactory"
protected="protwords.txt"
generateWordParts="1"
generateNumberParts="1"
catenateWords="1"
catenateNumbers="1"
catenateAll="0"
splitOnCaseChange="0"
preserveOriginal="1"/>
<filter class="solr.LengthFilterFactory" min="2" max="100" />
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
<!-- The query analyzer body was elided by the author; the fieldType is also left unclosed in this excerpt. -->
<analyzer type="query">
…
</analyzer>
My query is
start=0&rows=25&q=education&fl=id%2Centity_id%2Centity_type%2Cbundle%2Cbundle_name%2Csort_label%2Css_language%2Cis_comment_count%2Cds_created%2Cds_changed%2Cscore%2Cpath%2Curl%2Cis_uid%2Ctos_name%2Czm_parent_entity%2Css_filemime%2Css_file_entity_title%2Css_file_entity_url&pf=content%5E2.0&&sort=sort_label%20asc
This is done with the WordDelimiterFilterFactory. Set generateWordParts=1 and add this filter to the analyzer of your field type, as shown below.
After modifying the schema.xml restart the server and re-index the data.
<!-- "text" type: a single analyzer (no type attribute) built on a whitespace tokenizer. -->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"/>
    <!-- Word and number parts are generated and catenated; the original token is preserved. -->
    <filter class="solr.WordDelimiterFilterFactory"
            protected="protwords.txt"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="0"
            splitOnCaseChange="0"
            preserveOriginal="1"/>
    <filter class="solr.LengthFilterFactory" min="2" max="100" />
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>

How to search from database using solr

Solr Version : 5.0
I am working with Solr for the first time and do not fully understand it yet. Here is what I did:
I have created a core named - search
Then my schema.xml file has the following code:
<?xml version="1.0" encoding="UTF-8" ?>
<!-- Minimal schema "simple": two field types and two fields, keyed on id. -->
<schema name="simple" version="1.5">
  <types>
    <fieldtype name="string" class="solr.StrField" />
    <fieldtype name="long" class="solr.TrieLongField" />
  </types>
  <fields>
    <!-- The types "int" and "text" used here are not declared in the types section above. -->
    <field name="id" type="int" required="true" indexed="true"/>
    <field name="name" type="text" required="true" indexed="true"/>
  </fields>
  <uniqueKey>id</uniqueKey>
  <!-- "fullText" is not one of the fields declared above. -->
  <defaultSearchField>fullText</defaultSearchField>
  <solrQueryParser defaultOperator="OR" />
</schema>
solrconfig.xml :
<?xml version="1.0" encoding="UTF-8" ?>
<!-- solrconfig.xml for the "search" core: default, update, admin and data-import handlers. -->
<config>
  <luceneMatchVersion>5.0.0</luceneMatchVersion>

  <!-- Loads the DataImportHandler jar(s) from the dist directory. -->
  <lib dir="../../../../dist/" regex="solr-dataimporthandler-.*\.jar" />

  <requestHandler name="standard" class="solr.StandardRequestHandler" default="true" />
  <requestHandler name="/update" class="solr.UpdateRequestHandler" />
  <requestHandler name="/admin/" class="org.apache.solr.handler.admin.AdminHandlers" />

  <!-- Data import is driven by db-data-config.xml. -->
  <requestHandler name="/dataimport" class="org.apache.solr.handler.dataimport.DataImportHandler">
    <lst name="defaults">
      <str name="config">db-data-config.xml</str>
    </lst>
  </requestHandler>

  <admin>
    <defaultQuery>*:*</defaultQuery>
  </admin>
</config>
db-data-config.xml :
<!-- DIH config: a single "users" entity read from the local MySQL database "solr". -->
<dataConfig>
  <dataSource
      type="JdbcDataSource"
      driver="com.mysql.jdbc.Driver"
      url="jdbc:mysql://localhost:3306/solr"
      user="root"
      password="" />
  <document>
    <!-- No field mappings are declared for this entity. -->
    <entity name="users" query="select id,name from users;" />
  </document>
</dataConfig>
I have created a database in phpMyAdmin; please find the screenshot below:
When I click Query in the Solr admin panel, the result is empty. Why?
Can anyone help me on this, as I am new to solr search. What I am doing wrong ?
I don't see a field named "fullText" in schema.xml, so why is it defined as the default search field?
<defaultSearchField>fullText</defaultSearchField>
change it
<defaultSearchField>name</defaultSearchField>
mention the fields in the data config xml
<field column="ID" name="id" />
<field column="NAME" name="name" />
your data-config should look alike
<!-- DIH config: "users" entity with explicit column-to-field mappings. -->
<dataConfig>
  <dataSource
      type="JdbcDataSource"
      driver="com.mysql.jdbc.Driver"
      url="jdbc:mysql://localhost:3306/solr"
      user="root"
      password="" />
  <document>
    <entity name="users" query="select id,name from users">
      <!-- Result-set columns ID and NAME feed the Solr fields id and name. -->
      <field column="ID" name="id" />
      <field column="NAME" name="name" />
    </entity>
  </document>
</dataConfig>
add it as in schema.xml
<!-- Field types referenced by the fields below. -->
<types>
  <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
  <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
</types>
<!-- Both fields are indexed and stored. -->
<fields>
  <field name='id' type='int' required='true' indexed="true" stored="true"/>
  <field name='name' type='string' required='true' indexed="true" stored="true"/>
  <!-- The fields element is now closed; the original repeated the opening tag here. -->
</fields>
Make the changes in your db-data-config.xml similar to what i have done
<!--
  "city_masters" entity keyed on city_id. delete_status is aliased to
  city_masters_delete_status in the query so it can be mapped by that name below.
  The query attribute is now terminated with its closing quote; without it the
  start tag is not well-formed.
  NOTE(review): indexed/stored are written on the field mappings here; confirm
  whether they have any effect outside the schema's own field definitions.
-->
<entity name="city_masters" pk="city_id"
        query="SELECT delete_status as city_masters_delete_status,city_id,country_id,city_name,city_updated from city_masters">
  <field column="city_id" name="id"/>
  <field column="city_name" name="city_name" indexed="true" stored="true" />
  <field column="country_id" name="country_id" indexed="true" stored="true" />
  <field column="city_masters_delete_status" name="city_masters_delete_status"
         indexed="true" stored="true" />
</entity>
You left out the field column mappings. Add them as I have done in my code and it should work. If it still doesn't work, let me know.

doctrine2 (xml): in a one-to-many bidirectional relationship, can't get the inverse side?

i have Banner and Group entities with xml configs
Banner.xml
<!-- Banner mapping: table luc_banners, many-to-one to Group (inverse field "banners"). -->
<mapped-superclass name="Banner"
                   table="luc_banners"
                   repository-class="BannerRepository">
  <id name="id" column="id" type="integer">
    <generator strategy="AUTO" />
  </id>

  <field name="path" column="path" type="string" nullable="true" />
  <field name="link" column="link" type="string" nullable="true" />
  <field name="position" column="position" type="string" nullable="true" />

  <!-- Column group_id is mapped twice: by this scalar field and by the join column of "group" below. -->
  <field name="groupId" column="group_id" type="integer" />

  <many-to-one field="group" target-entity="Group" inversed-by="banners">
    <join-column name="group_id" referenced-column-name="id" nullable="false" on-delete="CASCADE" />
  </many-to-one>
</mapped-superclass>
Group.xml
<!--
  Group mapping: table luc_banners_groups, inverse side of the Banner#group association.
  NOTE(review): Doctrine's documentation restricts one-to-many associations on
  mapped superclasses; confirm whether this should be an entity element instead.
-->
<mapped-superclass name="Group" table="luc_banners_groups" repository-class="GroupRepository">
  <id name="id" column="id" type="integer">
    <generator strategy="AUTO" />
  </id>

  <field name="name" column="group_name" type="string" nullable="true" />
  <field name="type" column="group_type" type="string" nullable="true" />
  <field name="status" column="group_status" type="string" nullable="true" />
  <field name="order" column="group_order" type="integer" nullable="true" />

  <!--
    The target must be the class mapped as name="Banner". The previous value
    "Banners" matched no mapping, so the inverse side Group#banners could not
    be resolved. mapped-by names the owning field, Banner#group.
  -->
  <one-to-many field="banners" target-entity="Banner" mapped-by="group">
    <cascade>
      <cascade-all />
    </cascade>
  </one-to-many>
</mapped-superclass>
When I try to get the banners from a Group object I get an empty array. The profiler shows this SQL: SELECT l0_.group_name AS group_name0, l0_.id AS id4 FROM luc_banners_groups l0_ WHERE l0_.group_status = ? and it reports the entity as invalid with this error: Banner - The association Banner#group refers to the inverse side field Group#banners which does not exist.
Can you help me with this thing ?
remove <field name="groupId" column="group_id" type="integer" /> from Banner.xml, I think it's overriding the second one..

Doctrine 2 unknown column type requested

I'm trying to update my doctrine schema with the command:
php app/console doctrine:schema:update --force
I'm getting this error:
[Doctrine\DBAL\DBALException]
Unknown column type requested.
I'm getting this error since I've updated the xml mapping of the user entity in the Sonata UserBundle like this:
<?xml version="1.0" encoding="UTF-8"?>
<!-- Doctrine ORM mapping for the Sonata UserBundle User entity (table fos_user_user). -->
<doctrine-mapping xmlns="http://doctrine-project.org/schemas/orm/doctrine-mapping"
                  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                  xsi:schemaLocation="http://doctrine-project.org/schemas/orm/doctrine-mapping
                                      http://doctrine-project.org/schemas/orm/doctrine-mapping.xsd">
  <entity name="Application\Sonata\UserBundle\Entity\User"
          table="fos_user_user"
          repository-class="Application\Sonata\UserBundle\Repository\UserRepository">
    <id name="id" column="id" type="integer">
      <generator strategy="AUTO" />
    </id>

    <!-- Profile and ranking columns -->
    <field name="name" type="string" length="50" />
    <field name="birthdate" type="date" />
    <field name="natRanking" type="string" length="10" />
    <field name="interNatRanking" type="string" length="10" nullable="true" />
    <field name="natDoublesRanking" type="string" length="10" />
    <field name="interNatDoublesRanking" type="string" length="10" nullable="true" />
    <field name="doublesPartner" type="string" length="50" nullable="true" />
    <field name="nationality" type="string" length="50" />

    <!-- Upload bookkeeping -->
    <field name="fileName" type="string" length="255" nullable="true" />
    <field name="path" type="string" length="255" nullable="true" />
    <!-- NOTE(review): "file" has no type attribute; confirm whether this property should be persisted at all. -->
    <field name="file" />

    <!--
      NOTE(review): this association declares mapped-by (inverse side) together
      with a join-table, and the join column is team_id while the inverse join
      column is user_id. Confirm which side is meant to own the relation.
    -->
    <many-to-many field="teams" target-entity="Tennisconnect\DashboardBundle\Entity\Team" mapped-by="players">
      <join-table name="team_user">
        <join-columns>
          <join-column name="team_id" referenced-column-name="id"/>
        </join-columns>
        <inverse-join-columns>
          <join-column name="user_id" referenced-column-name="id"/>
        </inverse-join-columns>
      </join-table>
    </many-to-many>

    <one-to-many field="my_friends" target-entity="Friend" mapped-by="friends_of_mine" />
    <one-to-many field="friended_me" target-entity="Friend" mapped-by="friends_with_me" />
  </entity>
  <!-- Closes the root element, which the original snippet left open. -->
</doctrine-mapping>
Is the type of the field "file" missing?

Resources