grouping tokens with same first element value - pyparsing

Given a nested list:
(layer 0
(dielectric FR4)
(thickness 0.005)
)
(layer 1
(dielectric copper)
(thickness 0.01)
)
(layer 2
(dielectric FR4)
(thickness 0.005)
)
(physicalLayerGroup "TOP"
(minLineWidth ALWAYS_CHECK)
(maxLineWidth ALWAYS_CHECK)
(maxBondingWireLength NEVER_CHECK)
(minBondingWireLength NEVER_CHECK)
)
(physicalLayerGroup "L02"
(minLineWidth ALWAYS_CHECK)
(maxLineWidth ALWAYS_CHECK)
(maxBondingWireLength NEVER_CHECK)
(minBondingWireLength NEVER_CHECK)
)
(physicalLayerGroup "L03"
(minLineWidth ALWAYS_CHECK)
(maxLineWidth ALWAYS_CHECK)
(maxBondingWireLength NEVER_CHECK)
(minBondingWireLength NEVER_CHECK)
)
i.e. a list of lists where the first element of each token is identical, how can I group them? It sounds like addCondition() would fit the bill, but how would one code it? Something like the following:
pp.OneOrMore(pp.Word.setName('keyy'), pp.Word.setName('valu'), pp.OneOrMore(pp.ParseResults()).setParseAction(foo).addCondition(keyy is constant))
Any ideas?
code_warrior

The easiest approach, assuming that you know the names of each group, is to match each group in turn by its expected name. Using a helper function (defined below as outer_group) makes this easier:
text = """
(layer 0
(dielectric FR4)
(thickness 0.005)
)
... etc. ...
)"""
import pyparsing as pp
LPAR, RPAR = map(pp.Suppress, "()")
key_expr = pp.Word(pp.alphas)
qs = pp.quotedString().addParseAction(pp.removeQuotes)
val_expr = pp.pyparsing_common.number | qs | pp.Word(pp.printables, excludeChars='()')
# Dict keys need to be converted to str's
subgroup_key = val_expr().addParseAction(lambda t: str(t[0]))
subgroup = pp.Dict(pp.OneOrMore(pp.Group(LPAR + key_expr + val_expr + RPAR)))
# use this method to define each group by name
def outer_group(group_name):
    lookahead = pp.FollowedBy(LPAR + pp.Suppress(group_name))
    return lookahead + pp.Dict(pp.ZeroOrMore(pp.Group(LPAR
                                                      + pp.Suppress(group_name)
                                                      + subgroup_key
                                                      + subgroup
                                                      + RPAR)))(group_name)
# define groups by name as given in the posted question
# '&' operator generates an Each expression, which allows the groups to be
# parsed in any order
parser = outer_group('layer') & outer_group('physicalLayerGroup')
# how it works
data = parser.parseString(text)
print(data.dump())
print(data.layer['0'].dielectric)
print(data.physicalLayerGroup.TOP.maxLineWidth)
prints:
[[['0', ['dielectric', 'FR4'], ['thickness', 0.005]], ['1', ['dielectric', 'copper'], ['thickness', 0.01]], ['2', ['dielectric', 'FR4'], ['thickness', 0.005]]], [['TOP', ['minLineWidth', 'ALWAYS_CHECK'], ['maxLineWidth', 'ALWAYS_CHECK'], ['maxBondingWireLength', 'NEVER_CHECK'], ['minBondingWireLength', 'NEVER_CHECK']], ['L02', ['minLineWidth', 'ALWAYS_CHECK'], ['maxLineWidth', 'ALWAYS_CHECK'], ['maxBondingWireLength', 'NEVER_CHECK'], ['minBondingWireLength', 'NEVER_CHECK']], ['L03', ['minLineWidth', 'ALWAYS_CHECK'], ['maxLineWidth', 'ALWAYS_CHECK'], ['maxBondingWireLength', 'NEVER_CHECK'], ['minBondingWireLength', 'NEVER_CHECK']]]]
- layer: [['0', ['dielectric', 'FR4'], ['thickness', 0.005]], ['1', ['dielectric', 'copper'], ['thickness', 0.01]], ['2', ['dielectric', 'FR4'], ['thickness', 0.005]]]
  - 0: [['dielectric', 'FR4'], ['thickness', 0.005]]
    - dielectric: 'FR4'
    - thickness: 0.005
  - 1: [['dielectric', 'copper'], ['thickness', 0.01]]
    - dielectric: 'copper'
    - thickness: 0.01
  - 2: [['dielectric', 'FR4'], ['thickness', 0.005]]
    - dielectric: 'FR4'
    - thickness: 0.005
- physicalLayerGroup: [['TOP', ['minLineWidth', 'ALWAYS_CHECK'], ['maxLineWidth', 'ALWAYS_CHECK'], ['maxBondingWireLength', 'NEVER_CHECK'], ['minBondingWireLength', 'NEVER_CHECK']], ['L02', ['minLineWidth', 'ALWAYS_CHECK'], ['maxLineWidth', 'ALWAYS_CHECK'], ['maxBondingWireLength', 'NEVER_CHECK'], ['minBondingWireLength', 'NEVER_CHECK']], ['L03', ['minLineWidth', 'ALWAYS_CHECK'], ['maxLineWidth', 'ALWAYS_CHECK'], ['maxBondingWireLength', 'NEVER_CHECK'], ['minBondingWireLength', 'NEVER_CHECK']]]
  - L02: [['minLineWidth', 'ALWAYS_CHECK'], ['maxLineWidth', 'ALWAYS_CHECK'], ['maxBondingWireLength', 'NEVER_CHECK'], ['minBondingWireLength', 'NEVER_CHECK']]
    - maxBondingWireLength: 'NEVER_CHECK'
    - maxLineWidth: 'ALWAYS_CHECK'
    - minBondingWireLength: 'NEVER_CHECK'
    - minLineWidth: 'ALWAYS_CHECK'
  - L03: [['minLineWidth', 'ALWAYS_CHECK'], ['maxLineWidth', 'ALWAYS_CHECK'], ['maxBondingWireLength', 'NEVER_CHECK'], ['minBondingWireLength', 'NEVER_CHECK']]
    - maxBondingWireLength: 'NEVER_CHECK'
    - maxLineWidth: 'ALWAYS_CHECK'
    - minBondingWireLength: 'NEVER_CHECK'
    - minLineWidth: 'ALWAYS_CHECK'
  - TOP: [['minLineWidth', 'ALWAYS_CHECK'], ['maxLineWidth', 'ALWAYS_CHECK'], ['maxBondingWireLength', 'NEVER_CHECK'], ['minBondingWireLength', 'NEVER_CHECK']]
    - maxBondingWireLength: 'NEVER_CHECK'
    - maxLineWidth: 'ALWAYS_CHECK'
    - minBondingWireLength: 'NEVER_CHECK'
    - minLineWidth: 'ALWAYS_CHECK'
FR4
ALWAYS_CHECK
Convert the parsed structure to JSON by calling asDict():
# emit JSON from parsed results
import json
print(json.dumps(data.asDict(), indent=' '))
prints:
{
 "physicalLayerGroup": {
  "TOP": {
   "minBondingWireLength": "NEVER_CHECK",
   "minLineWidth": "ALWAYS_CHECK",
   "maxBondingWireLength": "NEVER_CHECK",
   "maxLineWidth": "ALWAYS_CHECK"
  },
  "L02": {
   "minBondingWireLength": "NEVER_CHECK",
   "minLineWidth": "ALWAYS_CHECK",
   "maxBondingWireLength": "NEVER_CHECK",
   "maxLineWidth": "ALWAYS_CHECK"
  },
  "L03": {
   "minBondingWireLength": "NEVER_CHECK",
   "minLineWidth": "ALWAYS_CHECK",
   "maxBondingWireLength": "NEVER_CHECK",
   "maxLineWidth": "ALWAYS_CHECK"
  }
 },
 "layer": {
  "0": {
   "dielectric": "FR4",
   "thickness": 0.005
  },
  "1": {
   "dielectric": "copper",
   "thickness": 0.01
  },
  "2": {
   "dielectric": "FR4",
   "thickness": 0.005
  }
 }
}
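As for the addCondition() idea in the question: it isn't needed with the approach above, but for reference, a condition that accepts a (key value) group only when the key has a fixed value could look roughly like this (a minimal sketch, not part of the answer above; names like keyy/valu are taken from the question's pseudocode):
import pyparsing as pp

LPAR, RPAR = map(pp.Suppress, "()")
pair = pp.Group(LPAR + pp.Word(pp.alphas)('keyy') + pp.Word(pp.alphanums)('valu') + RPAR)

# addCondition() receives the parsed tokens and rejects the match when it returns False
layer_pair = pair.copy().addCondition(lambda t: t[0]['keyy'] == 'layer',
                                      message="expected key 'layer'")

print(pair.parseString("(dielectric FR4)"))    # matches
# layer_pair.parseString("(dielectric FR4)")   # raises ParseException, key is not 'layer'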

Related

Scrapy: Limiting the number of next pages that are scraped. Unfortunately, the DEPTH_LIMIT custom setting doesn't work

I have built a simple Amazon scraper to download listings of the products. However, I am not sure how I can limit the number of next pages that are crawled. Ideally, I don't want the spider to crawl more than 10 pages for each main page it starts with. Some of the URLs in fact only have 2 pages.
Here is my code:
import scrapy
from scrapy.crawler import CrawlerProcess
from scraper_api import ScraperAPIClient
# Error Management Modules
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError
from .datatransformation import ErrorFileManagement
# Importing all defined attributes and items to be scraped!
from ..items import AmazonListingItems
from ..attributes import *
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.sql.expression import column

class AmazonListings(scrapy.Spider):
    name = "amazonlistings"

    def start_requests(self):
        error = ErrorManager()
        client = ScraperAPIClient('50903e1bf8db5418a25334f3e3ed7c74')
        db = create_engine('postgresql://postgres:Maisha123#localhost:5432')
        urls = db.execute('select category_url from scrapycategory')
        df = pd.DataFrame(urls.fetchall())
        urls = df.values.tolist()
        for url in urls:
            yield scrapy.Request(client.scrapyGet(url=url[0]), callback=self.parse, errback=error.error_handler, dont_filter=True)
        custom_settings = {
            'DEPTH_LIMIT': 3,
            'DOWNLOAD_DELAYED': 5
        }

    def parse(self, response):
        items = AmazonListingItems()
        ap = AttributeParser()
        error = ErrorManager()
        client = ScraperAPIClient('50903e1bf8db5418a25334f3e3ed7c74')
        itemlist = ap.itemlist(response)
        if itemlist:
            for item in itemlist:
                items['mainurl'] = response.url
                items['producturl'] = ap.producturl(item)
                items['productname'] = ap.productname(item)
                items['price'] = ap.price(item)
                items['ratings'] = ap.ratings(item)
                items['reviews'] = ap.reviews(item)
                items['heroimg'] = ap.heroimg(item)
                items['badge'] = ap.badge(item)
                yield items
            next_page = ap.next_page(response)
            if next_page:
                dom = 'www.amazon.com'
                if dom in next_page:
                    request = scrapy.Request(client.scrapyGet(next_page), callback=self.parse, errback=error.error_handler)
                    yield request
                else:
                    next_page_url = 'https://www.amazon.com' + next_page
                    request = scrapy.Request(client.scrapyGet(next_page_url), callback=self.parse, errback=error.error_handler)
                    yield request
        else:
            error.error_handler(response, itemlist=False)

# All Attribute Parser
class AttributeParser:
    def itemlist(self, response):
        itemlist = []
        itemlist.append(response.css('.zg-item'))
        itemlist.append(response.css('.s-asin .sg-col-inner'))
        if itemlist:
            for item in itemlist:
                if item:
                    return item

    def producturl(self, response):
        for urls in AmazonListing_producturl:
            value = response.css(urls).extract()
            if value:
                return value

    def productname(self, response):
        for productname in AmazonListing_productname:
            value = response.css(productname).extract()
            if value:
                return value

    def price(self, response):
        for price in AmazonListing_price:
            value = response.css(price).extract()
            if value:
                return value

    def ratings(self, response):
        for ratings in AmazonListing_ratings:
            value = response.css(ratings).extract()
            if value:
                return value

    def reviews(self, response):
        for reviews in AmazonListing_reviews:
            value = response.css(reviews).extract()
            if value:
                return value

    def heroimg(self, response):
        for heroimg in AmazonListing_heroimg:
            value = response.css(heroimg).extract()
            if value:
                return value

    def badge(self, response):
        for badge in AmazonListing_badge:
            value = response.css(badge).extract()
            if value:
                return value

    def next_page(self, response):
        for nxtpg in AmazonListing_nextpage:
            value = response.css(nxtpg).get()
            if value:
                return value
            else:
                return None

class ErrorManager:
    def error_handler(self, failure, itemlist=True):
        er = ErrorFileManagement()
        if itemlist == False:
            response = failure
            failure_record = {
                'request_url': response.url,
                'request_url': response.request.url,
                'status': response.status,
                'ip_address': response.ip_address,
                'headers': response.headers,
                'response.body': response.body,
            }
            er.addError(failure_record)
        elif failure.check(HttpError):
            response = failure.value.response
            failure_record = {
                'request_url': response.request.url,
                'response_url': response.url,
                'status': response.status,
                'ip_address': response.ip_address,
                'headers': response.headers,
                'response_body': response.body,
            }
            er.addError(failure_record)
        elif failure.check(DNSLookupError):
            response = failure.request
            failure_record = {
                'request_url': response.request.url,
                'response_url': response.url,
                'status': response.status,
                'ip_address': response.ip_address,
                'headers': response.headers,
                'response_body': response.body,
            }
            er.addError(failure)
        elif failure.check(TimeoutError, TCPTimedOutError):
            response = failure.request
            failure_record = {
                'request_url': response.request.url,
                'response_url': response.url,
                'status': response.status,
                'ip_address': response.ip_address,
                'headers': response.headers,
                'response_body': response.body,
            }
            er.addError(failure_record)
        elif failure.status == 200:
            response = failure
            failure_record = {
                'request_url': response.request.url,
                'response_url': response.url,
                'status': response.status,
                'ip_address': response.ip_address,
                'headers': response.headers,
                'response_body': response.body,
            }
            er.addError(failure_record)
        else:
            response = failure
            failure_record = {
                'request_url': response.request.url,
                'response_url': response.url,
                'status': response.status,
                'ip_address': response.ip_address,
                'headers': response.headers,
                'response_body': response.body,
            }
            er.addError(failure_record)

process = CrawlerProcess(settings={
    'FEEDS': {
        '/mnt/d/dev/dsiqscraper/amzlistings.csv': {'format': 'csv'},
    },
})
process.crawl(AmazonListings)
process.start()
custom_settings is supposed to be a class attribute. (As an aside, the standard Scrapy setting is spelled DOWNLOAD_DELAY; there is no DOWNLOAD_DELAYED setting.)
Like this:
class AmazonListings(scrapy.Spider):
    name = "amazonlistings"

    custom_settings = {
        'DEPTH_LIMIT': 3,
        'DOWNLOAD_DELAYED': 5
    }

    def start_requests(self):
        error = ErrorManager()
        client = ScraperAPIClient('50903e1bf8db5418a25334f3e3ed7c74')
        db = create_engine('postgresql://postgres:Maisha123#localhost:5432')
        urls = db.execute('select category_url from scrapycategory')
        df = pd.DataFrame(urls.fetchall())
        urls = df.values.tolist()
        for url in urls:
            yield scrapy.Request(client.scrapyGet(url=url[0]), callback=self.parse, errback=error.error_handler, dont_filter=True)

    def parse...........
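Beyond DEPTH_LIMIT, the "no more than 10 pages per main page" requirement can also be enforced directly by carrying a counter in request.meta. A rough sketch (names and selector are illustrative, not from the code above):
import scrapy

class PagedSpider(scrapy.Spider):
    name = "pagedspider"
    max_pages = 10  # hypothetical cap per start URL

    def parse(self, response):
        # how many pages of this start URL's chain we have seen so far
        pages_seen = response.meta.get('pages_seen', 1)
        # ... yield items for this page ...
        next_page = response.css('li.a-last a::attr(href)').get()  # selector is illustrative
        if next_page and pages_seen < self.max_pages:
            yield response.follow(next_page, callback=self.parse,
                                  meta={'pages_seen': pages_seen + 1})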

Extract value from key value pairs and convert to CSV

I have below data set,
data:[{'name': 'cable', 'status': 'none'}, {'name': 'laptop', 'status': 'loaded', 'mode': 'high'}
{'name': 'samsung', 'status': 'none'}], location:[{'place': 'chennai', 'distance': '100km'},
{'place': 'bangalore', 'distance': '200km'}]
I am trying to extract the values and convert them to CSV. I am facing issues while converting, as it's a multidimensional array. Any suggestion would be helpful.
If my data were just {'name': 'cable', 'status': 'none'}, {'name': 'laptop', 'status': 'loaded', 'mode': 'high'}, I am able to get it through awk using the below:
awk -F " = " -v OFS="," '
    BEGIN { print "name","status","mode","place","distance" }
    function printline() {
        print data["name"], data["status"], data["mode"]
    }
    {data[$1] = $2}
    NF == 0 {printline(); delete data}
    END {printline()}
'
But am unable to get it with my original data set,
Original data,
data:[{'name': 'cable', 'status': 'none'}, {'name': 'laptop', 'status': 'loaded', 'mode': 'high'}
{'name': 'samsung', 'status': 'none'}], location:[{'place': 'chennai', 'distance': '100km'},
{'place': 'bangalore', 'distance': '200km'}]
Expected result,
name status mode place distance
cable none null chennai 100km
laptop loaded high bangalore 200km
samsung none null null null
Here's a start with a step-by-step approach using any awk in any shell on all UNIX boxes:
$ cat tst.awk
{ rec = (NR>1 ? rec " " : "") $0 }
END {
    # Identify from rec:
    # 1) [{'name': 'cable', 'status': 'none'}, {'name': 'laptop', 'status': 'loaded', 'mode': 'high'} {'name': 'samsung', 'status': 'none'}]
    # 2) [{'place': 'chennai', 'distance': '100km'}, {'place': 'bangalore', 'distance': '200km'}]
    str = rec
    while ( match(str,/\[[^]]+/) ) {
        val = substr(str,RSTART+1,RLENGTH-1)
        level1vals[++numLevel1vals] = val
        str = substr(str,RSTART+RLENGTH)
    }

    for (level1valNr=1; level1valNr<=numLevel1vals; level1valNr++) {
        level1val = level1vals[level1valNr]
        # Identify from level1vals[1]:
        # 1) 'name': 'cable', 'status': 'none'
        # 2) 'name': 'laptop', 'status': 'loaded', 'mode': 'high'
        # 3) 'name': 'samsung', 'status': 'none'
        # and from level1vals[2]:
        # 4) 'place': 'chennai', 'distance': '100km'
        # 5) 'place': 'bangalore', 'distance': '200km'
        level2valNr = 0
        str = level1val
        while ( match(str,/{[^}]+/) ) {
            val = substr(str,RSTART+1,RLENGTH-1)
            ++level2valNr
            level2vals[level2valNr] = level2vals[level2valNr] " " val
            numLevel2vals = (level2valNr > numLevel2vals ? level2valNr : numLevel2vals)
            str = substr(str,RSTART+RLENGTH)
        }
    }

    # NOTE: delete these print loops when done testing/debugging
    for (level1valNr=1; level1valNr<=numLevel1vals; level1valNr++) {
        print "level1vals[" level1valNr "] = <" level1vals[level1valNr] ">"
    }
    print ""
    for (level2valNr=1; level2valNr<=numLevel2vals; level2valNr++) {
        print "level2vals[" level2valNr "] = <" level2vals[level2valNr] ">"
    }
}
$ awk -f tst.awk file
level1vals[1] = <{'name': 'cable', 'status': 'none'}, {'name': 'laptop', 'status': 'loaded', 'mode': 'high'} {'name': 'samsung', 'status': 'none'}>
level1vals[2] = <{'place': 'chennai', 'distance': '100km'}, {'place': 'bangalore', 'distance': '200km'}>
level2vals[1] = < 'name': 'cable', 'status': 'none' 'place': 'chennai', 'distance': '100km'>
level2vals[2] = < 'name': 'laptop', 'status': 'loaded', 'mode': 'high' 'place': 'bangalore', 'distance': '200km'>
level2vals[3] = < 'name': 'samsung', 'status': 'none'>
Add another round of looping using match($0,/\047[^\047]+/) to identify each 'foo' string, store in an array and then loop through that final array in the appropriate order to print the CSV.
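A rough, self-contained sketch of that extra round, run on one level2vals-style line from the output above (\047 is the single-quote character; the sample string is illustrative):
BEGIN {
    str = " 'name': 'cable', 'status': 'none' 'place': 'chennai', 'distance': '100km'"
    # pull each 'foo' string out in order, stripping the surrounding quotes
    while ( match(str,/\047[^\047]+\047/) ) {
        vals[++n] = substr(str,RSTART+1,RLENGTH-2)
        str = substr(str,RSTART+RLENGTH)
    }
    for (i=1; i<=n; i++) print i, vals[i]
}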
Here is a bash/perl script to transform the original data to the "expected result" format. To make it produce the result in CSV format, just change $DLMTR="\t" to $DLMTR=",":
% cat data.txt
data:[{'name': 'cable', 'status': 'none'}, {'name': 'laptop', 'status': 'loaded', 'mode': 'high'}
{'name': 'samsung', 'status': 'none'}], location:[{'place': 'chennai', 'distance': '100km'},
{'place': 'bangalore', 'distance': '200km'}]
% cat transform_data.sh
#!/usr/bin/bash
cat $* | tr "," "\n" | perl -lne '
    BEGIN {
        $i=$j=$data=$location=0;
        # Change $DLMTR (delimiter) from "\t" (Tab) to "," for CSV format
        $DLMTR="\t"
    }
    if (/data:/) {$data=1};
    if (/location:/) {$location = 1; $data = 0;};
    if ($data) { # process elements within data:[]
        # \047 = single-quote and change to \042 if double-quote is required
        $i++ if /\{/;
        /\047name\047:/ && do { $name[$i]=$status[$i]=$mode[$i]=$place[$i]=$distance[$i]="null";
                                ($name[$i])=/:\s*\047(.+?)\047/};
        /\047status\047:/ && do {($status[$i])=/:\s*\047(.+?)\047/};
        /\047mode\047:/ && do {($mode[$i])=/:\s*\047(.+?)\047/};
    }
    elsif ($location) { # process elements within location:[]
        $j++ if /\{/;
        /\047place\047:/ && do {($place[$j])=/:\s*\047(.+?)\047/};
        /\047distance\047:/ && do {($distance[$j])=/:\s*\047(.+?)\047/;};
    }
    END {
        print "name${DLMTR}status${DLMTR}mode${DLMTR}place${DLMTR}distance";
        foreach $n (1..$i) {
            print "$name[$n]${DLMTR}$status[$n]${DLMTR}$mode[$n]${DLMTR}$place[$n]${DLMTR}$distance[$n]";
        }
    }'
% transform_data.sh data.txt
name status mode place distance
cable none null chennai 100km
laptop loaded high bangalore 200km
samsung none null null null

How to set locale locally for moment.js

The code below gives an invalid date:
moment.locale('en');
var localLocale = moment('enero 22, 2017', 'MMMM DD, YYYY');
localLocale.locale('es');
alert(localLocale.format('L'));
You will need to import that particular locale JS from the Moment.js repo.
I've imported this one in the code below, and it works perfectly:
https://raw.githubusercontent.com/moment/moment/develop/locale/es.js
//! moment.js locale configuration
//! locale : Spanish [es]
//! author : Julio Napurí : https://github.com/julionc
;(function (global, factory) {
    typeof exports === 'object' && typeof module !== 'undefined'
        && typeof require === 'function' ? factory(require('../moment')) :
    typeof define === 'function' && define.amd ? define(['../moment'], factory) :
    factory(global.moment)
}(this, (function (moment) { 'use strict';

    var monthsShortDot = 'ene._feb._mar._abr._may._jun._jul._ago._sep._oct._nov._dic.'.split('_');
    var monthsShort = 'ene_feb_mar_abr_may_jun_jul_ago_sep_oct_nov_dic'.split('_');
    var monthsParse = [/^ene/i, /^feb/i, /^mar/i, /^abr/i, /^may/i, /^jun/i, /^jul/i, /^ago/i, /^sep/i, /^oct/i, /^nov/i, /^dic/i];
    var monthsRegex = /^(enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|ene\.?|feb\.?|mar\.?|abr\.?|may\.?|jun\.?|jul\.?|ago\.?|sep\.?|oct\.?|nov\.?|dic\.?)/i;

    var es = moment.defineLocale('es', {
        months : 'enero_febrero_marzo_abril_mayo_junio_julio_agosto_septiembre_octubre_noviembre_diciembre'.split('_'),
        monthsShort : function (m, format) {
            if (!m) {
                return monthsShortDot;
            } else if (/-MMM-/.test(format)) {
                return monthsShort[m.month()];
            } else {
                return monthsShortDot[m.month()];
            }
        },
        monthsRegex : monthsRegex,
        monthsShortRegex : monthsRegex,
        monthsStrictRegex : /^(enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre)/i,
        monthsShortStrictRegex : /^(ene\.?|feb\.?|mar\.?|abr\.?|may\.?|jun\.?|jul\.?|ago\.?|sep\.?|oct\.?|nov\.?|dic\.?)/i,
        monthsParse : monthsParse,
        longMonthsParse : monthsParse,
        shortMonthsParse : monthsParse,
        weekdays : 'domingo_lunes_martes_miércoles_jueves_viernes_sábado'.split('_'),
        weekdaysShort : 'dom._lun._mar._mié._jue._vie._sáb.'.split('_'),
        weekdaysMin : 'do_lu_ma_mi_ju_vi_sá'.split('_'),
        weekdaysParseExact : true,
        longDateFormat : {
            LT : 'H:mm',
            LTS : 'H:mm:ss',
            L : 'DD/MM/YYYY',
            LL : 'D [de] MMMM [de] YYYY',
            LLL : 'D [de] MMMM [de] YYYY H:mm',
            LLLL : 'dddd, D [de] MMMM [de] YYYY H:mm'
        },
        calendar : {
            sameDay : function () {
                return '[hoy a la' + ((this.hours() !== 1) ? 's' : '') + '] LT';
            },
            nextDay : function () {
                return '[mañana a la' + ((this.hours() !== 1) ? 's' : '') + '] LT';
            },
            nextWeek : function () {
                return 'dddd [a la' + ((this.hours() !== 1) ? 's' : '') + '] LT';
            },
            lastDay : function () {
                return '[ayer a la' + ((this.hours() !== 1) ? 's' : '') + '] LT';
            },
            lastWeek : function () {
                return '[el] dddd [pasado a la' + ((this.hours() !== 1) ? 's' : '') + '] LT';
            },
            sameElse : 'L'
        },
        relativeTime : {
            future : 'en %s',
            past : 'hace %s',
            s : 'unos segundos',
            m : 'un minuto',
            mm : '%d minutos',
            h : 'una hora',
            hh : '%d horas',
            d : 'un día',
            dd : '%d días',
            M : 'un mes',
            MM : '%d meses',
            y : 'un año',
            yy : '%d años'
        },
        dayOfMonthOrdinalParse : /\d{1,2}º/,
        ordinal : '%dº',
        week : {
            dow : 1, // Monday is the first day of the week.
            doy : 4  // The week that contains Jan 4th is the first week of the year.
        }
    });

    return es;

})));
Your Code:
moment.updateLocale('es');
var localLocale = moment('enero 22, 2017', 'MMMM DD, YYYY');
alert(localLocale.format('L'));
Demo:
http://jsfiddle.net/6df2xf80/
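As an aside, moment can also take the locale as the third argument to the parsing call, which keeps the locale change local to that instance instead of global (this still assumes the es locale file above is loaded):
// Parse a Spanish date string without touching the global locale.
var localLocale = moment('enero 22, 2017', 'MMMM DD, YYYY', 'es');
alert(localLocale.format('L')); // 22/01/2017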

How to count orders with positions unwound in MongoDB aggregation?

I have a collection with order headers and positions (as an array), and I need a query which gives me:
quantity of customers
quantity of orders
summed-up order value
all grouped by date and order type. I already have this covered by two queries (see below), but I want to have it in one.
The main problem for me is that I need to count the orders, but with positions unwound.
E.g.: below would be a possible result of the combined query with the test data below:
/* 1 */
{
"_id" : {
"typ" : "WERBUNG",
"date" : "2017-07-08"
},
"orderQuantity" : 1.0,
"value" : 1000,
"customerQuantity" : 1
}
/* 2 */
{
"_id" : {
"typ" : "WERBUNG",
"date" : "2017-07-07"
},
"orderQuantity" : 2.0,
"value" : 100,
"customerQuantity" : 1
}
/* 3 */
{
"_id" : {
"typ" : "ANDERE",
"date" : "2017-07-08"
},
"orderQuantity" : 4.0,
"value" : 1500,
"customerQuantity" : 4
}
/* 4 */
{
"_id" : {
"typ" : "ANDERE",
"date" : "2017-07-07"
},
"orderQuantity" : 1.0,
"value" : 90,
"customerQuantity" : 1
}
... this would mean:
On 7-7 there were 3 orders (WERBUNG 2, ANDERE 1) for only 1 customer (WERBUNG 1, ANDERE 1 - the customer will be counted twice here, but this would be okay)
On 8-7 there were 5 orders (WERBUNG 1, ANDERE 4) for 5 customers (WERBUNG 4, ANDERE 1)
I have an idea that $sortByCount would help here; however, we still use 3.2, so no access to this stage (and some other useful options as well...).
Cheers!
--
Some information which might help:
// Here are the sample orders:
/*1*/
{
"_id" : ObjectId("596075d5be8fc415341c7d43"),
"header" : {
"kundennummer" : "820130",
"auftragsdatum" : 0,
"bestellangaben" : "BLOCK1",
"information1" : "blocktest",
"erstellungsdatum" : 1499493785.25906,
"vorgabeauftragsnummer" : 87475000,
},
"ordertype" : "BLOCK1",
"customernnummer" : "820130",
"ordernumber" : 87475000,
"positions" : [
"artikelnummer" : 1985900,
"menge" : 1,
"bruttopreis" : 1000,
"_id" : ObjectId("596075d5be8fc415341c7d45")
}
],
"date" : "2017-07-08",
"type" : "WERBUNG"
}
/*2*/
{
"_id" : ObjectId("59608f64be8fc415341c7d46"),
"header" : {
"kundennummer" : "944867",
"auftragsdatum" : 0,
"bestellangaben" : "",
"information1" : "blocktest",
"erstellungsdatum" : 1499500356.10022,
"vorgabeauftragsnummer" : 87475001,
},
"ordertype" : "",
"customernnummer" : "944867",
"ordernumber" : 87475001,
"positions" : [
{
"artikelnummer" : 4029300,
"menge" : 1,
"bruttopreis" : 100,
"_id" : ObjectId("59608f64be8fc415341c7d5c")
}
],
"date" : "2017-07-08",
"type" : "ANDERE"
}
/*3*/
{
"_id" : ObjectId("5960925ebe8fc415341c7d5d"),
"header" : {
"kundennummer" : "981927",
"auftragsdatum" : 0,
"bestellangaben" : "",
"information1" : "blocktest",
"erstellungsdatum" : 1499501036.34265,
"vorgabeauftragsnummer" : 87475002,
},
"ordertype" : "",
"customernnummer" : "981927",
"ordernumber" : 87475002,
"positions" : [
{
"artikelnummer" : 4557300,
"menge" : 2,
"bruttopreis" : 100,
"_id" : ObjectId("5960925ebe8fc415341c7d74")
}
],
"date" : "2017-07-08",
"type" : "ANDERE"
}
/*4*/
{
"_id" : ObjectId("5960925ebe8fc415341c7d75"),
"header" : {
"kundennummer" : "981927",
"auftragsdatum" : 0,
"bestellangaben" : "BLOCK2",
"information1" : "blocktest",
"erstellungsdatum" : 1499414714,
"vorgabeauftragsnummer" : 87475003,
},
"ordertype" : "BLOCK2",
"customernnummer" : "981927",
"ordernumber" : 87475003,
"positions" : [
{ "artikelnummer" : 7081200,
"menge" : 3,
"bruttopreis" : 10,
"_id" : ObjectId("5960925ebe8fc415341c7d8f")
}
],
"date" : "2017-07-07",
"type" : "WERBUNG"
}
/*5*/
{
"_id" : ObjectId("596093ebbe8fc415341c7d90"),
"header" : {
"kundennummer" : "962422",
"auftragsdatum" : 0,
"bestellangaben" : "",
"information1" : "blocktest",
"erstellungsdatum" : 1499501507.75201,
"vorgabeauftragsnummer" : 87475004,
},
"ordertype" : "",
"customernnummer" : "962422",
"ordernumber" : 87475004,
"positions" : [
"artikelnummer" : 3545900,
"menge" : 4,
"bruttopreis" : 100,
"_id" : ObjectId("596093ebbe8fc415341c7d95")
}
],
"date" : "2017-07-08",
"type" : "ANDERE"
}
/*6*/
{
"_id" : ObjectId("596098e9be8fc415341c7ddf"),
"header" : {
"kundennummer" : "981927",
"auftragsdatum" : 0,
"bestellangaben" : "BLOCK3",
"information1" : "blocktest",
"erstellungsdatum" : 1499415886,
"vorgabeauftragsnummer" : 87475007,
},
"ordertype" : "BLOCK3",
"customernnummer" : "981927",
"ordernumber" : 87475007,
"positions" : [
{
"artikelnummer" : 1006199,
"menge" : 7,
"bruttopreis" : 10,
"_id" : ObjectId("596098e9be8fc415341c7de6")
}
],
"date" : "2017-07-07",
"type" : "WERBUNG"
}
/*7*/
{
"_id" : ObjectId("59609a47be8fc415341c7de7"),
"header" : {
"kundennummer" : "981225",
"auftragsdatum" : 0,
"bestellangaben" : "",
"information1" : "blocktest",
"erstellungsdatum" : 1499503113.21714,
},
"ordertype" : "",
"customernnummer" : "981225",
"ordernumber" : 87475008,
"positions" : [
{
"_id": ObjectId("59609a47be8fc415341c7e0d")
"artikelnummer" : 2308400,
"menge" : 8,
"bruttopreis" : 100,
}
],
"date" : "2017-07-08",
"type" : "ANDERE"
}
/*8*/
{
"_id" : ObjectId("59609a47be8fc415341c7e0e"),
"header" : {
"vorgabeauftragsnummer" : 87475009,
"erstellungsdatum" : 1499416697,
"information1" : "blocktest",
"bestellangaben" : "",
"auftragsdatum" : 0,
"kundennummer" : "981927",
},
"ordertype" : "",
"customernnummer" : "981927",
"ordernumber" : 87475009,
"positions" : [
"_id" : ObjectId("59609a47be8fc415341c7e57"),
"bruttopreis" : 10,
"menge" : 9,
"artikelnummer" : 8017000
}
],
"date" : "2017-07-07",
"type" : "ANDERE"
}
// Query 1: Quantity of customers and order value by order type (WERBUNG, ANDERE) and day
db.getCollection('orders').aggregate([
    {$unwind: "$positions"},
    {$project: {
        "_id": 1,
        customer: "$header.customernnummer",
        date: {$dateToString: {format: "%d-%m-%Y", date: {"$add": [new Date(0), {"$multiply": [1000, "$header.erstellungsdatum"]}]}}},
        edate: "$header.erstellungsdatum",
        ordertype: "$header.ordertype",
        type: {$cond: {if: {$ne: ["$header.ordertype", ""]}, then: "WERBUNG", else: "ANDERE"}},
        value: {$multiply: ["$positions.price", "$positions.quantity"]},
    }},
    {$group: {
        _id: {type: "$type", tag: "$date", customer: "$customer"},
        type: {$first: "$type"},
        date: {$first: "$date"},
        wert: {$sum: "$value"}
    }},
    {$project: {
        _id: 0,
        customer: "$customer",
        type: "$type",
        date: "$date",
        wert: "$wert"
    }},
    {$group: {
        _id: {typ: "$type", date: "$date"},
        customerQuantity: {$sum: 1},
        value: {$sum: "$wert"}
    }},
    {$sort: {
        typ: 1,
        date: -1
    }}
])
// Query 2: Order quantity by type, date
...
{$project: {
    block: {$cond: {if: {$ne: ["$auftragskopf.bestellangaben", ""]}, then: "WERBUNG", else: "ANDERE"}},
    datum: {$dateToString: {format: "%d-%m-%Y", date: {"$add": [new Date(0), {"$multiply": [1000, "$auftragskopf.erstellungsdatum"]}]}}},
}},
{$group: {
    _id: {block: "$block", datum: "$datum"},
    auftragsanzahl: {$sum: 1},
}},
I'm not sure exactly how your data looks, but I understand from this line
type: {$cond: { if: {$ne: ["$header.ordertype" ,""]} , then: "WERBUNG", else: "ANDERE" }}
that if your $header.ordertype is WERBUNG then you want your order type to be WERBUNG, otherwise it's ANDERE. With that in mind, here is my solution.
db.getCollection('orders').aggregate([
    {$project: {
        "_id": 1,
        header: 1,
        positions: 1,
        date: {$dateToString: {format: "%Y-%m-%d", date: {"$add": [new Date(0), {"$multiply": [1000, "$header.date"]}]}}},
        type: {$cond: {if: {$eq: ["$header.ordertype", "WERBUNG"]}, then: "WERBUNG", else: "ANDERE"}}
    }},
    {$group: {
        _id: {type: "$type", date: "$date"},
        werbungCount: {$sum: {$cond: [{$eq: ['$type', 'WERBUNG']}, 1, 0]}},
        andereCount: {$sum: {$cond: [{$eq: ['$type', 'ANDERE']}, 1, 0]}},
        customer: {$first: "$header.customernnummer"},
        date: {$first: "$date"},
        type: {$first: "$type"},
        positions: {$first: "$positions"}
    }},
    {$unwind: "$positions"},
    {$project: {
        "_id": 1,
        customer: "$customer",
        date: 1,
        ordertype: "$header.ordertype",
        type: "$type",
        value: {$multiply: ["$positions.price", "$positions.quantity"]},
        price: "$positions.price",
        quantity: "$positions.quantity",
        orderQuantity: {$cond: {if: {$eq: ["$type", "WERBUNG"]}, then: "$werbungCount", else: "$andereCount"}},
    }},
    {$group: {
        _id: {type: "$type", tag: "$date", customer: "$customer"},
        type: {$first: "$type"},
        date: {$first: "$date"},
        wert: {$sum: "$value"},
        orderQuantity: {$first: "$orderQuantity"}
    }},
    {$group: {
        _id: {typ: "$type", date: "$date"},
        orderQuantity: {$first: "$orderQuantity"},
        value: {$sum: "$wert"}
    }},
    {$sort: {typ: 1, date: -1}}
])
I use the first $project to "normalize" the date, so an order placed at 1499415886 (Friday, July 7, 2017 8:24:46 AM) and one placed at 1499415990 (Friday, July 7, 2017 8:26:30 AM) will both be counted, since they fall on the same date (I could tell from what you wrote that the time doesn't matter to you, only the date).
In the $group after that $project, you count the documents that have the ordertype WERBUNG into the field werbungCount; otherwise, if the ordertype is empty, they are counted into the field andereCount. The pipeline after that $project and $group is as you wrote it; I just corrected some mistakes, changed field names in a few spots, and added the orderQuantity field with a condition in the second $project.
Hope that works!
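As a side note (not part of the answer above), distinct customers per (type, date) group can also be collected with $addToSet and counted with $size, both available in MongoDB 3.2; a sketch using the top-level type, date, and customernnummer fields from the sample documents:
db.getCollection('orders').aggregate([
    {$group: {
        _id: {typ: "$type", date: "$date"},
        customers: {$addToSet: "$customernnummer"},  // one entry per distinct customer
        orderQuantity: {$sum: 1}                     // one order header per document
    }},
    {$project: {
        orderQuantity: 1,
        customerQuantity: {$size: "$customers"}
    }}
])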

Sync API.AI entities from firebase database

I am building a bot for food ordering. I have saved my food categories (i.e. Starters, Meals, Fish, Meat, Pasta, etc.) in Firebase, and every food category can have multiple food items.
Here is my food-item structure in Firebase:
{
"-KauXk2E4ma1u70X-h4g" : {
"food_category" : "fish",
"food_desc" : "Zander filets in pastry made with beer, tartar sauce",
"food_like" : true,
"food_name" : "Homemade fish sticks",
"food_price" : 45,
"food_url" : "https://i.scaley.io/768x687/lebijou.io/_jamesapp/food/airhomeCHZH3/FISH/Homemade fish sticks.jpg"
},
"-KauY4B0pQDgZ_sF92Df" : {
"food_category" : "meat",
"food_desc" : "Rindsfiletmedaillons (160 g), Balsamicosauce, fried potatoes",
"food_like" : true,
"food_name" : "Medallion of beef with balsamic-sauce",
"food_price" : 45,
"food_url" : "https://i.scaley.io/768x687/lebijou.io/_jamesapp/food/airhomeCHZH3/MEAT/Medallion of beef with balsamic-sauce.jpg"
},
"-KauYG4BcYT5EY36TQM7" : {
"food_category" : "fish",
"food_desc" : "",
"food_like" : true,
"food_name" : "Gilt head fillets",
"food_price" : 45,
"food_url" : "https://i.scaley.io/768x687/lebijou.io/_jamesapp/food/airhomeCHZH3/FISH/Gilt head fillets.jpg"
},
"-KauYHAaRDF6ZgAFy7U_" : {
"food_category" : "meat",
"food_desc" : "",
"food_like" : true,
"food_name" : "Münsterhof Food - Burger",
"food_price" : 45,
"food_url" : "https://i.scaley.io/768x687/lebijou.io/_jamesapp/food/airhomeCHZH3/MEAT/Münsterhof Food - Burger.jpg"
},
"-KaucBQaWncdZOc5kat2" : {
"food_category" : "pasta",
"food_desc" : "Rigatoni (type of pasta), tomato, pepper, garlic",
"food_like" : true,
"food_name" : "Arrabbiata",
"food_price" : 45,
"food_url" : "https://i.scaley.io/768x687/lebijou.io/_jamesapp/food/airhomeCHZH3/PASTA/Arrabbiata.jpg"
},
"-KaucGDalMgMRwfCGbvy" : {
"food_category" : "pasta",
"food_desc" : "Tagliatelle (type of pasta), tomato, beef",
"food_like" : true,
"food_name" : "Bolognese",
"food_price" : 45,
"food_url" : "https://i.scaley.io/768x687/lebijou.io/_jamesapp/food/airhomeCHZH3/PASTA/Bolognese.jpg"
},
"-KaucisapOUvgKF2xcjg" : {
"food_category" : "pasta",
"food_desc" : "Rigatoni (type of pasta), tomato, basil",
"food_like" : true,
"food_name" : "Napoli",
"food_price" : 45,
"food_url" : "https://i.scaley.io/768x687/lebijou.io/_jamesapp/food/airhomeCHZH3/PASTA/Napoli.jpg"
},
"-KaucsqbjLVKL3oAv4y3" : {
"food_category" : "pizza",
"food_desc" : "Homemade pizza dough, tomato sauce, mozzarella cheese, ham, free-range egg, pesto",
"food_like" : true,
"food_name" : "Calzone",
"food_price" : 45,
"food_url" : "https://i.scaley.io/768x687/lebijou.io/_jamesapp/food/airhomeCHZH3/PIZZA/Calzone.jpg"
},
"-KaucwQb6NNgkUCg8bZD" : {
"food_category" : "pizza",
"food_desc" : "Homemade pizza dough, tomato sauce, mozzarella cheese, basil",
"food_like" : true,
"food_name" : "Margherita",
"food_price" : 45,
"food_url" : "https://i.scaley.io/768x687/lebijou.io/_jamesapp/food/airhomeCHZH3/PIZZA/Margherita.jpg"
},
"-Kaud7xE6JmQtqhLiLQP" : {
"food_category" : "pizza",
"food_desc" : "Homemade pizza dough, tomato sauce, mozzarella cheese, artichokes, hot pepper, mushrooms, ham, olives, oregano",
"food_like" : true,
"food_name" : "Quattro stagioni",
"food_price" : 45,
"food_url" : "https://i.scaley.io/768x687/lebijou.io/_jamesapp/food/airhomeCHZH3/PIZZA/Quattro stagioni.jpg"
},
"-Kaue7Wzluk-wy9xys5n" : {
"food_category" : "starters",
"food_desc" : "beef-carpaccio, parmesan, olive oil",
"food_like" : true,
"food_name" : "Beef carpaccio",
"food_price" : 45,
"food_url" : "https://i.scaley.io/768x687/lebijou.io/_jamesapp/food/airhomeCHZH3/STARTERS/Beef carpaccio.jpg"
}
}
What I am trying to achieve is to sync my food items (under food categories) from Firebase to api.ai entities.
I have done it manually on api.ai, as shown below:
http://prntscr.com/fhpme6
Now I want to do all this via code (Node.js).
I know api.ai provides APIs to work with entities, i.e. https://docs.api.ai/docs/entities (I have used them).
I can extract food items from Firebase and sync the data to api.ai entities. But there is a problem: I can't overwrite and/or delete the entities already used in my intents.
Here is the code I have tried so far, FoodSync.js:
var util = require('util');
var exec = require('child_process').exec;

var apiai_developer_access_code = "e764e24fa51e4d93bdc1e6bde92d07d8"

class FoodSync {

    syncFood(firebaseRef, apiAiRef, callback) {
        // Retrieves a list of all entities for the agent.
        var all_entities = 'curl -k -H "Authorization: Bearer ' + apiai_developer_access_code + '" "https://api.api.ai/v1/entities?v=20150910"'
        var food_category_id = "";
        var all_entities_array = [];
        var all_food_cat_names = [];
        var all_food_cat_entities_id = [];
        var that = this;
        that.execCommand(all_entities, function(error, stdout, stderr) {
            var jsonResponse = JSON.parse(stdout)
            all_entities_array = jsonResponse;
            for (var i = 0; i < jsonResponse.length; i++) {
                var entity = jsonResponse[i];
                if (entity.name == "food-categories") {
                    food_category_id = entity.id;
                    break;
                }
            }
            console.log('food_category_id: ' + food_category_id);
            if (food_category_id == "") {
                return;
            }
            var food_category_entity_url = that.getEntityUrlWithID(food_category_id);
            console.log('food_category_entity_url: ' + food_category_entity_url);
            that.execCommand(food_category_entity_url, function(error, stdout, stderr) {
                var jsonResponse = JSON.parse(stdout)
                console.log('food_category_entity: ' + JSON.stringify(jsonResponse));
                var food_cat_all_entities = jsonResponse.entries;
                console.log("food_cat_all_entities : " + food_cat_all_entities.length);
                for (var i = 0; i < food_cat_all_entities.length; i++) {
                    var food_cat_entity = food_cat_all_entities[i];
                    var food_cat_entity_name = food_cat_entity.value.replace('#', '')
                    console.log("food_cat_entity_name : " + food_cat_entity_name);
                    console.log("all_entities_array.length : " + all_entities_array.length);
                    for (var j = 0; j < all_entities_array.length; j++) {
                        var entity_check = all_entities_array[j];
                        if (entity_check.name == food_cat_entity_name) {
                            all_food_cat_entities_id.push(entity_check.id);
                        }
                    }
                    // Here I got all food-entities ids.
                    console.log('all_food_cat_entities_ids: ' + all_food_cat_entities_id);
                    // Here I am calling the DELETE entity first (then recreate).
                    // Problem is my entity is used in some of my intents, so upon
                    // delete I am getting the error:
                    // {
                    //   "id": "09e2c21d-b4e3-4a34-a774-eab781c957e1",
                    //   "timestamp": "2017-06-09T10:01:19.294Z",
                    //   "lang": "en",
                    //   "status": {
                    //     "code": 400,
                    //     "errorType": "bad_request",
                    //     "errorDetails": "Some entity names are in use: food-cat-fish"
                    //   }
                    // }
                }
            })
            // if (error !== null) {
            //     console.log('exec error: ' + error);
            //     callback()
            // }
        })
    }

    getEntityUrlWithID(entitiy_id) {
        // Retrieves the specified entity.
        var entities = 'curl -k -H "Authorization: Bearer ' + apiai_developer_access_code + '" "https://api.api.ai/v1/entities/' + entitiy_id + '?v=20150910"'
        return entities;
    }

    execCommand(command, callback) {
        console.log('execCommand: called');
        exec(command, function(error, stdout, stderr) {
            //console.log('stdout: ' + stdout);
            //console.log('stderr: ' + stderr);
            if (error !== null) {
                console.log('exec error: ' + error);
            }
            callback(error, stdout, stderr)
        });
    }
}

module.exports = FoodSync
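One possible way around the delete-then-recreate problem is to overwrite the entity in place: the entities docs linked above also list a PUT on /entities/{eid} for updating an existing entity, which avoids deleting one that intents still reference. A hedged sketch in the same curl-building style as the code above (entity_id and the payload shape are assumptions, not verified against the API):
// Hypothetical sketch: update (overwrite) an entity's entries via PUT
// instead of DELETE + re-create.
var payload = JSON.stringify({
    name: "food-cat-fish", // must match the existing entity's name
    entries: [
        { value: "Homemade fish sticks", synonyms: ["Homemade fish sticks"] },
        { value: "Gilt head fillets", synonyms: ["Gilt head fillets"] }
    ]
});
var update_entity = 'curl -k -X PUT' +
    ' -H "Authorization: Bearer ' + apiai_developer_access_code + '"' +
    ' -H "Content-Type: application/json"' +
    ' --data \'' + payload + '\'' +
    ' "https://api.api.ai/v1/entities/' + entity_id + '?v=20150910"';
// that.execCommand(update_entity, callback);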
