UnboundLocalError: local variable 'key_schema' referenced before assignment - amazon-dynamodb-dax

I'm trying to integrate DAX with PynamoDB (working on an existing pull request). While testing the batch write method from the AWS DAX SDK I'm running into the issue below. Any help is appreciated.
/.virtualenvs/PynamoDB/lib/python2.7/site-packages/amazondax/DaxClient.pyc in batch_write_item(self, **kwargs)
98 Stubs.write_batchWriteItem_116217951_1(request, tube)
99
--> 100 result = self._decode_result('BatchWriteItem', request, Assemblers.batchWriteItem_116217951_1, tube)
101 result = self._resolve_attribute_values(result, tube)
102 return result
/.virtualenvs/PynamoDB/lib/python2.7/site-packages/amazondax/DaxClient.pyc in _decode_result(self, operation_name, request, assembler, tube)
223 if status == DaxClient.SUCCESS:
224 tube.skip() # Throw away the empty error header
--> 225 return assembler(request, tube)
226 else:
227 return self._handle_error(operation_name, tube)
/.virtualenvs/PynamoDB/lib/python2.7/site-packages/amazondax/Assemblers.pyc in batchWriteItem_116217951_1(request, tube)
115
116 for _ in range(num_items, 0, -2):
--> 117 key = AttributeValueDecoder._decode_key_bytes(_dec, key_schema)
118 if _dec.try_decode_null():
119 # DeleteRequest
UnboundLocalError: local variable 'key_schema' referenced before assignment

This is a known issue that I hope to have a fix out for shortly.
Update: this is now fixed in version 1.0.4 of the DAX Python client.
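For anyone testing the same path, upgrading the client and calling batch_write_item directly against the DAX client is a quick sanity check. The sketch below is illustrative only: the cluster endpoint, table name, and attribute names are placeholders, and the AmazonDaxClient constructor arguments follow my recollection of the AWS sample code for the amazondax package, so verify them against the version you install (>= 1.0.4).
# Sketch only: endpoint, table, and key names are placeholders, and the
# AmazonDaxClient constructor arguments follow the AWS sample for amazondax
# (verify against your installed version, >= 1.0.4).
import botocore.session
from amazondax import AmazonDaxClient

session = botocore.session.get_session()
dax = AmazonDaxClient(session,
                      region_name='us-east-1',
                      endpoints=['my-dax-cluster.abc123.clustercfg.dax.use1.cache.amazonaws.com:8111'])

# Same low-level request shape as the botocore DynamoDB client.
response = dax.batch_write_item(
    RequestItems={
        'my-table': [
            {'PutRequest': {'Item': {'pk': {'S': 'id-1'}, 'data': {'S': 'hello'}}}},
            {'DeleteRequest': {'Key': {'pk': {'S': 'id-2'}}}},
        ]
    }
)
print(response.get('UnprocessedItems', {}))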

Related

Getting error Caused by: com.databricks.NotebookExecutionException: FAILED

I am trying to run the notebook below through Databricks but I'm getting the error below. I have tried increasing the notebook timeout and adding a retry mechanism, but still no luck.
notebooks = [
    NotebookData("/Users/mynotebook", 9900, retry=3)
]
res = parallelNotebooks(notebooks, 2)
result = [f.result(timeout=9900) for f in res]  # This is a blocking call.
print(result)
Can someone please help me to sort out this issue? Thanks
%python
from concurrent.futures import ThreadPoolExecutor

class NotebookData:
    def __init__(self, path, timeout, parameters=None, retry=0):
        self.path = path
        self.timeout = timeout
        self.parameters = parameters
        self.retry = retry

def submitNotebook(notebook):
    print("Running notebook %s" % notebook.path)
    try:
        if (notebook.parameters):
            return dbutils.notebook.run(notebook.path, notebook.timeout, notebook.parameters)
        else:
            return dbutils.notebook.run(notebook.path, notebook.timeout)
    except Exception:
        if notebook.retry < 1:
            raise
        print("Retrying notebook %s" % notebook.path)
        notebook.retry = notebook.retry - 1
        submitNotebook(notebook)

def parallelNotebooks(notebooks, numInParallel):
    # This code limits the number of parallel notebooks.
    with ThreadPoolExecutor(max_workers=numInParallel) as ec:
        return [ec.submit(submitNotebook, notebook) for notebook in notebooks]

notebooks = [
    NotebookData("/Users/mynotebook", 1200000, retry=0)
]
res = parallelNotebooks(notebooks, 2)
result = [f.result(timeout=1200000) for f in res]  # This is a blocking call.
print(result)
Error:
Py4JJavaError Traceback (most recent call last)
<command-1143841910698378> in <module>
32 ]
33 res = parallelNotebooks(notebooks, 2)
---> 34 result = [f.result(timeout=1200000) for f in res] # This is a blocking call.
35 print(result)
<command-1143841910698378> in <listcomp>(.0)
32 ]
33 res = parallelNotebooks(notebooks, 2)
---> 34 result = [f.result(timeout=1200000) for f in res] # This is a blocking call.
35 print(result)
/usr/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
426 raise CancelledError()
427 elif self._state == FINISHED:
--> 428 return self.__get_result()
429
430 self._condition.wait(timeout)
/usr/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
/usr/lib/python3.7/concurrent/futures/thread.py in run(self)
55
56 try:
---> 57 result = self.fn(*self.args, **self.kwargs)
58 except BaseException as exc:
59 self.future.set_exception(exc)
<command-1143841910698378> in submitNotebook(notebook)
12 return dbutils.notebook.run(notebook.path, notebook.timeout, notebook.parameters)
13 else:
---> 14 return dbutils.notebook.run(notebook.path, notebook.timeout)
15 except Exception:
16 if notebook.retry < 1:
/local_disk0/tmp/1664351986642-0/dbutils.py in run(self, path, timeout_seconds, arguments, _NotebookHandler__databricks_internal_cluster_spec)
136 arguments,
137 __databricks_internal_cluster_spec,
--> 138 self.shell.currentJobGroup)
139
140 def __repr__(self):
/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1306
1307 for temp_arg in temp_args:
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
125 def deco(*a, **kw):
126 try:
--> 127 return f(*a, **kw)
128 except py4j.protocol.Py4JJavaError as e:
129 converted = convert_exception(e.java_exception)
/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o1741._run.
: com.databricks.WorkflowException: com.databricks.NotebookExecutionException: FAILED
at com.databricks.workflow.WorkflowDriver.run(WorkflowDriver.scala:95)
at com.databricks.dbutils_v1.impl.NotebookUtilsImpl.run(NotebookUtilsImpl.scala:122)
at com.databricks.dbutils_v1.impl.NotebookUtilsImpl._run(NotebookUtilsImpl.scala:89)
at sun.reflect.GeneratedMethodAccessor820.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
Caused by: com.databricks.NotebookExecutionException: FAILED
at com.databricks.workflow.WorkflowDriver.run0(WorkflowDriver.scala:141)
at com.databricks.workflow.WorkflowDriver.run(WorkflowDriver.scala:90)
... 12 more
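A side note on the driver code above, separate from the FAILED status itself (which is raised inside the child notebook): the retry branch never returns the value of the recursive submitNotebook call, so even a successful retry hands None back to the Future. A sketch of that function with the result propagated:
# Sketch: same logic as the question's submitNotebook, but returning the
# retried run's result so f.result() sees it (dbutils is Databricks-provided).
def submitNotebook(notebook):
    print("Running notebook %s" % notebook.path)
    try:
        if notebook.parameters:
            return dbutils.notebook.run(notebook.path, notebook.timeout, notebook.parameters)
        return dbutils.notebook.run(notebook.path, notebook.timeout)
    except Exception:
        if notebook.retry < 1:
            raise
        print("Retrying notebook %s" % notebook.path)
        notebook.retry = notebook.retry - 1
        return submitNotebook(notebook)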

RuntimeError: Error loading state dict for SrlBert Missing keys: ['bert_model.embeddings.position_ids'] Unexpected keys: []

I am just a beginner in NLP and was trying to learn the semantic role labeling concept through implementation.
I was trying to load the bert-base-srl model from AllenNLP's public storage, but I ran into the following error:
from allennlp.predictors.predictor import Predictor
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/bert-base-srl-2020.03.24.tar.gz")
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_11672/96061884.py in <module>
1 from allennlp.predictors.predictor import Predictor
----> 2 predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/bert-base-srl-2020.03.24.tar.gz")
~\anaconda3\lib\site-packages\allennlp\predictors\predictor.py in from_path(cls, archive_path, predictor_name, cuda_device, dataset_reader_to_load, frozen, import_plugins, overrides, **kwargs)
364 plugins.import_plugins()
365 return Predictor.from_archive(
--> 366 load_archive(archive_path, cuda_device=cuda_device, overrides=overrides),
367 predictor_name,
368 dataset_reader_to_load=dataset_reader_to_load,
~\anaconda3\lib\site-packages\allennlp\models\archival.py in load_archive(archive_file, cuda_device, overrides, weights_file)
233 config.duplicate(), serialization_dir
234 )
--> 235 model = _load_model(config.duplicate(), weights_path, serialization_dir, cuda_device)
236
237 # Load meta.
~\anaconda3\lib\site-packages\allennlp\models\archival.py in _load_model(config, weights_path, serialization_dir, cuda_device)
277
278 def _load_model(config, weights_path, serialization_dir, cuda_device):
--> 279 return Model.load(
280 config,
281 weights_file=weights_path,
~\anaconda3\lib\site-packages\allennlp\models\model.py in load(cls, config, serialization_dir, weights_file, cuda_device)
436 # get_model_class method, that recurses whenever it finds a from_archive model type.
437 model_class = Model
--> 438 return model_class._load(config, serialization_dir, weights_file, cuda_device)
439
440 def extend_embedder_vocab(self, embedding_sources_mapping: Dict[str, str] = None) -> None:
~\anaconda3\lib\site-packages\allennlp\models\model.py in _load(cls, config, serialization_dir, weights_file, cuda_device)
378
379 if unexpected_keys or missing_keys:
--> 380 raise RuntimeError(
381 f"Error loading state dict for {model.__class__.__name__}\n\t"
382 f"Missing keys: {missing_keys}\n\t"
RuntimeError: Error loading state dict for SrlBert
Missing keys: ['bert_model.embeddings.position_ids']
Unexpected keys: []
Does someone know a fix for this?
If you are on a later version of allennlp-models, you can use this archive_file instead: https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz.
The latest versions of the model archive files can be found on the demo page in the Model Card tab: https://demo.allennlp.org/semantic-role-labeling
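For reference, loading through the newer archive from the answer looks like the sketch below. It assumes recent allennlp and allennlp-models installs, and the sample sentence is made up.
# Sketch: same Predictor API as in the question, pointed at the newer archive.
# Assumes recent allennlp and allennlp-models packages; sample sentence is made up.
from allennlp.predictors.predictor import Predictor

predictor = Predictor.from_path(
    "https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz"
)
result = predictor.predict(sentence="The model loaded without missing keys.")
print(result["verbs"])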

AttributeError: 'datetime.timedelta' object has no attribute '_get_object_id' : pyspark

I'm trying to modify a date column.
The code is below:
sample = sample.withColumn('next_date', when(sample.next_date.isNull(), (sample['next_date'] + timedelta(days=1))).otherwise(sample['next_date']))
It's giving me the following error:
AttributeError Traceback (most recent call last)
<ipython-input-127-dd09f90d8a49> in <module>()
6 sample = sample.withColumn('next_date', lead('date').over(windowSpecs))
7
----> 8 sample = sample.withColumn('next_date', when(sample.next_date.isNull(), (sample['next_date'] + timedelta(days=1))).otherwise(sample['next_date']))
9
10 sample = sample.withColumn('snapshot_date', lit(dt.datetime.now().strftime("%d-%m-%Y %H:%M")))
/usr/lib/spark/python/pyspark/sql/column.py in _(self, other)
108 def _(self, other):
109 jc = other._jc if isinstance(other, Column) else other
--> 110 njc = getattr(self._jc, name)(jc)
111 return Column(njc)
112 _.__doc__ = doc
/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
802
803 args_command = "".join(
--> 804 [get_command_part(arg, self.pool) for arg in new_args])
805
806 command = proto.CALL_COMMAND_NAME +\
/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_command_part(parameter, python_proxy_pool)
276 command_part += ";" + interface
277 else:
--> 278 command_part = REFERENCE_TYPE + parameter._get_object_id()
279
280 command_part += "\n"
AttributeError: 'datetime.timedelta' object has no attribute '_get_object_id'
How do I resolve this?
Thanks in advance!
I know this is very old, but I solved the issue by doing this:
sample = sample.withColumn('next_date', when(sample.next_date.isNull(), date_add(col('next_date'), 1)).otherwise(sample['next_date']))
Hope this helps someone!
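For anyone hitting the same AttributeError: the underlying issue is that a Python datetime.timedelta cannot be sent over the py4j bridge, so arithmetic on a Column has to use Spark's own functions such as date_add. A minimal, self-contained sketch with made-up data:
# Minimal sketch with made-up data: use Spark's date_add instead of adding a
# Python timedelta to a Column (timedelta cannot cross the py4j bridge).
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_add, to_date

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("2016-01-31",)], ["next_date"])
df = df.withColumn("next_date", to_date(col("next_date")))

# df.withColumn("plus_one", col("next_date") + timedelta(days=1))  # raises the AttributeError
df = df.withColumn("plus_one", date_add(col("next_date"), 1))      # 2016-02-01
df.show()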

Using R's Digest Package to Sign an HTTP request to Amazon Dynamodb's API

I'm trying to use R's digest package to create a signature for Amazon's DynamoDB API. The signature requires SHA256. Currently, I am testing R's digest package to see if it yields the correct output, given my input.
According to the Amazon website example, if my input is:
Input: "iam"
then my output should be:
Targeted Output: 'f72cfd46f26bc4643f06a11eabb6c0ba18780c19a8da0c31ace671265e3c87fa'
When I use the following R command:
digest("iam", algo="sha256", serialize=FALSE)
I get the following output:
"d457e3a99392a03f47057f50ac1cbc5d0365131575477971bf85177a0c0fed22"
I've tried various input combinations (setting serialize=TRUE, etc.) but have not had any luck.
Update
Per Rohit's response, I updated my R function and approach, but I'm still not getting the correct sample signature. Here are my steps, based on Amazon's Calculate AWS Signature example.
I am now using the following R script:
hmac(hmac(hmac(hmac("AWS4wJalrXUtnFEMI/K7MDENG+bPxRfiCYEXAMPLEKEY","20110909", "sha256", serialize=FALSE, raw=FALSE),"us-east-1", "sha256", serialize=FALSE, raw=FALSE),"iam", "sha256", serialize=FALSE, raw=FALSE),"aws4_request", "sha256", serialize=TRUE, raw=TRUE)
Getting this result:
fe bd 15 b6 ac 8d 68 7a 93 f9 1c 9c dc 9e f8 d9 f1 79 fb a8 62 71 14 98 3a 35 0c 09 a0 ea 2e f5
that does not match the Sample Signature in the Amazon example:
152 241 216 137 254 196 244 66 26 220 82 43 171 12 225 248 46 105 41 194 98 237 21 229 169 76 144 239 209 227 176 231
I tried changing the parameters of my R function, but I just can't seem to match the Amazon example. If anyone has experience with this or sees something I'm doing wrong, I'd appreciate your input. Thanks
I think there are two places where the problem lies.
Firstly, the AWS v4 signature is an HMAC that uses the AWS secret key as a secret (among other things). The HMAC process uses a cryptographic hash like MD5 or SHA256, but it is not just a hash of a single piece of data ("iam" in your case) - it also needs a 'secret'. I guess you would be more interested in the hmac function in R - it can use SHA256 as an 'algo'.
Secondly, if you look at the Java example of how a signature is calculated and the expected values:
static byte[] HmacSHA256(String data, byte[] key) throws Exception {
    String algorithm = "HmacSHA256";
    Mac mac = Mac.getInstance(algorithm);
    mac.init(new SecretKeySpec(key, algorithm));
    return mac.doFinal(data.getBytes("UTF8"));
}

static byte[] getSignatureKey(String key, String dateStamp, String regionName, String serviceName) throws Exception {
    byte[] kSecret = ("AWS4" + key).getBytes("UTF8");
    byte[] kDate = HmacSHA256(dateStamp, kSecret);
    byte[] kRegion = HmacSHA256(regionName, kDate);
    byte[] kService = HmacSHA256(serviceName, kRegion);
    byte[] kSigning = HmacSHA256("aws4_request", kService);
    return kSigning;
}
Expected Values
kSecret = '41575334774a616c725855746e46454d492f4b374d44454e472b62507852666943594558414d504c454b4559'
kDate = '969fbb94feb542b71ede6f87fe4d5fa29c789342b0f407474670f0c2489e0a0d'
kRegion = '69daa0209cd9c5ff5c8ced464a696fd4252e981430b10e3d3fd8e2f197d7a70c'
kService = 'f72cfd46f26bc4643f06a11eabb6c0ba18780c19a8da0c31ace671265e3c87fa'
kSigning = 'f4780e2d9f65fa895f9c67b32ce1baf0b0d8a43505a000a1a9e090d414db404d'
You see that kService is calculated using kRegion, which is calculated using kDate, which itself is calculated using the AWS secret key. So you would have to do something similar to calculate the signature.
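The same chain is easy to sanity-check outside R. The sketch below mirrors getSignatureKey above using Python's standard library; the important detail is that each step feeds the raw bytes of the previous HMAC in as the key of the next one, not a hex string. The inputs are taken from the question, so compare the printed hex values against the expected values on the AWS example page you are following (its date stamp may differ from 20110909).
# Sketch mirroring the Java getSignatureKey() above with Python's standard
# library. Each HMAC's raw bytes (not a hex string) become the key of the next
# HMAC in the chain. Inputs are taken from the question; compare the printed
# hex against the expected values on the AWS example page you are following.
import hashlib
import hmac

def hmac_sha256(key, data):
    return hmac.new(key, data.encode("utf-8"), hashlib.sha256).digest()

def get_signature_key(secret_key, date_stamp, region, service):
    k_secret = ("AWS4" + secret_key).encode("utf-8")
    k_date = hmac_sha256(k_secret, date_stamp)
    k_region = hmac_sha256(k_date, region)
    k_service = hmac_sha256(k_region, service)
    k_signing = hmac_sha256(k_service, "aws4_request")
    return k_date, k_region, k_service, k_signing

for name, value in zip(("kDate", "kRegion", "kService", "kSigning"),
                       get_signature_key("wJalrXUtnFEMI/K7MDENG+bPxRfiCYEXAMPLEKEY",
                                         "20110909", "us-east-1", "iam")):
    print(name, value.hex())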

XLDateAmbiguous error even when using dayfirst argument

I'm trying to import data from an Excel spreadsheet into a pandas DataFrame, parsing dates. I'm using dayfirst, but I still get an XLDateAmbiguous error (docs).
The dates are in a single column in the format 25/09/1990
Could somebody explain to me why this happening and how I can fix it? Thanks in advance.
Edit: It seems as though the problem is caused by xlrd attempting to parse a non-date column as a date, even though I've specified which column the dates are in. Unfortunately I don't know how to explicitly indicate that a column should not be parsed as dates. Does anybody have any ideas?
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import dateutil
path6 = 'C:\\Users\\Site2_Homepage_2013-06-04.xlsx'
df8 = pd.io.excel.read_excel(path6, 'Site2_Homepage_2012_06_13', header=1, parse_dates=True, dayfirst=True)
XLDateAmbiguous Traceback (most recent call last)
<ipython-input-17-4a83d104ab72> in <module>()
4 path7 = 'C:\\Users\\Site4_Homepage_2013-06-04.xlsx'
5 path8 = 'C:\\Users\\Site7_Homepage_2013-06-04.xlsx'
----> 6 df8 = pd.io.excel.read_excel(path6, 'Site2_Homepage_2012_06_13', header=1, parse_dates=True, dayfirst=True)
7 df9 = pd.io.excel.read_excel(path7, 'Site4_Homepage_2012_06_13', header=1, parse_dates=[3], dayfirst=True)
8 df10 = pd.io.excel.read_excel(path8, 'Site7_Homepage_2012_06_13', header=1, parse_dates=[3], dayfirst=True)
C:\Users\AppData\Local\Enthought\Canopy32\User\lib\site-packages\pandas\io\excel.pyc in read_excel(io, sheetname, **kwds)
101 engine = kwds.pop('engine', None)
102
--> 103 return ExcelFile(io, engine=engine).parse(sheetname=sheetname, **kwds)
104
105
C:\Users\AppData\Local\Enthought\Canopy32\User\lib\site-packages\pandas\io\excel.pyc in parse(self, sheetname, header, skiprows, skip_footer, index_col, parse_cols, parse_dates, date_parser, na_values, thousands, chunksize, convert_float, has_index_names, **kwds)
206 skip_footer=skip_footer,
207 convert_float=convert_float,
--> 208 **kwds)
209
210 def _should_parse(self, i, parse_cols):
C:\Users\AppData\Local\Enthought\Canopy32\User\lib\site-packages\pandas\io\excel.pyc in _parse_excel(self, sheetname, header, skiprows, skip_footer, index_col, has_index_names, parse_cols, parse_dates, date_parser, na_values, thousands, chunksize, convert_float, **kwds)
267 if parse_cols is None or should_parse[j]:
268 if typ == XL_CELL_DATE:
--> 269 dt = xldate_as_tuple(value, datemode)
270 # how to produce this first case?
271 if dt[0] < datetime.MINYEAR: # pragma: no cover
C:\Users\AppData\Local\Enthought\Canopy32\User\lib\site-packages\xlrd\xldate.pyc in xldate_as_tuple(xldate, datemode)
78
79 if xldays < 61 and datemode == 0:
---> 80 raise XLDateAmbiguous(xldate)
81
82 jdn = xldays + _JDN_delta[datemode]
XLDateAmbiguous: 15.3
I didn't manage to find a solution to this. In the end I had to use .csv versions of the files for the dates to parse correctly.
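Since the workaround that ended up working was exporting to .csv, here is a minimal sketch of that route (the file path and column name are placeholders) that keeps the day-first behaviour the question needed:
# Sketch of the .csv workaround; the path and column name are placeholders.
import pandas as pd

df = pd.read_csv("Site2_Homepage_2013-06-04.csv")
df["date"] = pd.to_datetime(df["date"], dayfirst=True)  # 25/09/1990 -> 1990-09-25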
