AWS Glue job to import DynamoDB data

We are trying to migrate DynamoDB tables from our prod account to our stage account.
In the source account we use DynamoDB's "Export to S3" feature to put the compressed .json.gz files into a destination S3 bucket.
We have written a Glue script that reads the exported .json.gz files and writes the items to the target DDB table.
We are keeping the code generic so that we can migrate any DDB table from prod to stage.
While testing this, we ran into a problem writing NUMBER SET (NS) data to the target DDB table.
The following sample snippet raises a ValidationException when we try to insert into DDB:
from decimal import Decimal

def number_set(datavalue):
    # datavalue will be ['0', '1']
    set_of_values = set()
    for value in datavalue:
        set_of_values.add(Decimal(value))
    return set_of_values
When we run the job, we get the following ValidationException:
An error occurred while calling o82.pyWriteDynamicFrame. Supplied AttributeValue is empty, must contain exactly one of the supported datatypes (Service: AmazonDynamoDBv2; Status Code: 400; Error Code: ValidationException; Request ID: UKEU70T0BLIKN0K2OL4RU56TGVVV4KQNSO5AEMVJF66Q9ASUAAJG; Proxy: null)
However, if we use int(value) instead of Decimal(value), no ValidationException is thrown and the job succeeds.
Our suspicion is that write_dynamic_frame_from_options infers the schema from the values each element contains: if an element holds int values the datatype is inferred as NS, but if it holds only Decimal values the writer fails to infer a datatype.
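Based on that observation, one workaround we are considering is to keep exact ints where possible (a sketch only; the float fallback for non-integral values is our own assumption and trades away Decimal precision):
from decimal import Decimal

def number_set(datavalue):
    # datavalue will be ['0', '1']
    # int values let the Glue DynamoDB writer infer NS; Decimal values do not.
    set_of_values = set()
    for value in datavalue:
        d = Decimal(value)
        # assumption: float precision is acceptable for non-integral numbers
        set_of_values.add(int(d) if d == d.to_integral_value() else float(d))
    return set_of_values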
The Glue job we have written:
dyf = glue_context.create_dynamic_frame_from_options(
    connection_type="s3",
    connection_options={
        "paths": [file_path]
    },
    format="json",
    transformation_ctx="dyf",
    recurse=True,
)
def number_set(datavalue):
    list_of_values = []
    for value in datavalue:
        list_of_values.append(Decimal(value))
    print("list of values ")
    print(list_of_values)
    return set(list_of_values)

def parse_list(datavalue):
    list_of_values = []
    for element in datavalue:
        list_of_values.append(generic_conversion(element))
    return list_of_values

def generic_conversion(value_dict):
    # value_dict has a single DynamoDB type tag as its key, e.g. {"N": "0"}
    for datatype, datavalue in value_dict.items():
        if datatype == 'N':
            value = Decimal(datavalue)
        elif datatype == 'S':
            value = datavalue
        elif datatype == 'NS':
            value = number_set(datavalue)
        elif datatype == 'BOOL':
            value = datavalue
        elif datatype == 'M':
            value = construct_map(datavalue)
        elif datatype == 'B':
            value = datavalue.encode('ascii')
        elif datatype == 'L':
            value = parse_list(datavalue)
    return value

def construct_map(row_dict):
    ddb_row = {}
    for key, value_dict in row_dict.items():
        # value_dict is keyed by the DynamoDB type tag ('N', 'S', ...)
        # if 'N' then use Decimal type
        ddb_row[key] = generic_conversion(value_dict)
    return ddb_row

def map_function(rec):
    row_dict = rec["Item"]
    return construct_map(row_dict)
mapped_dyF = Map.apply(frame=dyf, f=map_function, transformation_ctx="mapped_dyF")

datasink2 = glue_context.write_dynamic_frame_from_options(
    frame=mapped_dyF,
    connection_type="dynamodb",
    connection_options={
        "dynamodb.region": "us-east-1",
        "dynamodb.output.tableName": destination_table,
        "dynamodb.throughput.write.percent": "0.5"
    },
    transformation_ctx="datasink2"
)
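A fallback we are also evaluating is to bypass the Glue sink entirely and write through boto3, whose resource API serializes sets of Decimals to NS natively (a sketch only; file_bucket and file_key are hypothetical names for the export object, and construct_map is the function above):
import gzip
import json
import boto3

s3 = boto3.client("s3")
table = boto3.resource("dynamodb", region_name="us-east-1").Table(destination_table)

# DynamoDB exports are newline-delimited JSON, one {"Item": {...}} per line
obj = s3.get_object(Bucket=file_bucket, Key=file_key)  # hypothetical location
with gzip.open(obj["Body"], mode="rt") as lines, table.batch_writer() as batch:
    for line in lines:
        rec = json.loads(line)
        batch.put_item(Item=construct_map(rec["Item"]))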
Can anyone help us figure out how to get unblocked here?
The record we are trying to insert:
{
    "region": {
        "S": "to_delete"
    },
    "date": {
        "N": "20210916"
    },
    "number_set": {
        "NS": [
            "0",
            "1"
        ]
    },
    "test": {
        "BOOL": false
    },
    "map": {
        "M": {
            "test": {
                "S": "value"
            },
            "test2": {
                "S": "value"
            },
            "nestedmap": {
                "M": {
                    "key": {
                        "S": "value"
                    },
                    "nestedmap1": {
                        "M": {
                            "key1": {
                                "N": "0"
                            }
                        }
                    }
                }
            }
        }
    },
    "binary": {
        "B": "QUFBY2Q="
    },
    "list": {
        "L": [
            {
                "S": "abc"
            },
            {
                "S": "def"
            },
            {
                "N": "123"
            },
            {
                "M": {
                    "key2": {
                        "S": "value2"
                    },
                    "nestedmaplist": {
                        "M": {
                            "key3": {
                                "S": "value3"
                            }
                        }
                    }
                }
            }
        ]
    }
}

Related

Dynamically Parse Child Nodes in JSON

I have a deserialized object that I want to dynamically loop through to return the related results. The response package looks like so:
{"RatingResponse":
{"Success":"true",
"Message":"",
"QuoteID":"57451",
"LoadNum":"57451",
"Rates":
{"Rate":
[
{"SCAC":"test1",
"CarrierName":"TEST1",
"TransitTime":"1",
"ServiceLevel":"D",
"TotalCost":"1,031.82",
"ThirdPartyCharge":"1,031.82",
"Accessorials":
{"Accessorial":
[
{"Code":"400",
"Cost":"1,655.55",
"Description":"Freight"
},
{"Code":"DSC",
"Cost":"-952.77",
"Description":"Discount"
},
{"Code":"FUE",
"Cost":"329.04",
"Description":"Fuel Surcharge"
}
]
},
"QuoteNumber":""
},
{"SCAC":"test2",
"CarrierName":"TEST2",
"TransitTime":"1",
"ServiceLevel":"D",
"TotalCost":"1,031.82",
"ThirdPartyCharge":"1,031.82",
"Accessorials":
{"Accessorial":
[
{"Code":"400",
"Cost":"1,655.55",
"Description":"Freight"
},
{"Code":"DSC",
"Cost":"-952.77",
"Description":"Discount"
},
{"Code":"FUE",
"Cost":"329.04",
"Description":"Fuel Surcharge"
}
]
},
"QuoteNumber":""
}
]
},
"AverageTotalCost":"1,031.82"
}
}
I have parsed the response data so that there is less information to work with, especially since I only need the Accessorial Costs. The parsed response looks like:
[
    {
        "SCAC": "test1",
        "CarrierName": "TEST1",
        "TransitTime": "1",
        "ServiceLevel": "D",
        "TotalCost": "1,031.82",
        "ThirdPartyCharge": "1,031.82",
        "Accessorials": {
            "Accessorial": [
                { "Code": "400", "Cost": "1,655.55", "Description": "Freight" },
                { "Code": "DSC", "Cost": "-952.77", "Description": "Discount" },
                { "Code": "FUE", "Cost": "329.04", "Description": "Fuel Surcharge" }
            ]
        },
        "QuoteNumber": ""
    },
    {
        "SCAC": "test2",
        "CarrierName": "TEST2",
        "TransitTime": "1",
        "ServiceLevel": "D",
        "TotalCost": "1,031.82",
        "ThirdPartyCharge": "1,031.82",
        "Accessorials": {
            "Accessorial": [
                { "Code": "400", "Cost": "1,655.55", "Description": "Freight" },
                { "Code": "DSC", "Cost": "-952.77", "Description": "Discount" },
                { "Code": "FUE", "Cost": "329.04", "Description": "Fuel Surcharge" }
            ]
        },
        "QuoteNumber": ""
    }
]
The problem I am facing is that I will never know how many Rate items will come back in the response, nor the exact number of Accessorial costs. I'm hoping to capture the Rate child-node count and the Accessorial child-node count per Rate. Here's what I have so far:
Root rootObject = Newtonsoft.Json.JsonConvert.DeserializeObject<Root>(responseFromServer);

//rate stores the parsed response data
JArray rate = (JArray)JObject.Parse(responseFromServer)["RatingResponse"]["Rates"]["Rate"];
var rate2 = rate.ToString();

//this for loop works as expected. it grabs the number of Rate nodes (in this example, 2)
for (int i = 0; i < rate.Count(); i++)
{
    dynamic test2 = rate[i];
    //this is where I'm struggling
    dynamic em = (JArray)JObject.Parse(test2)["Accessorials"]["Accessorial"].Count();
    for (int j = 0; j < em; j++)
    {
        string test3 = test2.Accessorials.Accessorial[j].Cost;
        System.IO.File.AppendAllText(logPath, Environment.NewLine + test3 + Environment.NewLine);
    }
}
I apologize in advance for the bad formatting and odd variable names - I'm obviously still testing the functionality, so I've been using throwaway variables.
Where I'm struggling (as noted above) is getting to the Accessorial node to count how many items are in its array. I was thinking I could parse the first array (starting with the SCAC data) and drill down to the Accessorial node, but I'm not having any luck.
Any help is GREATLY appreciated, especially since I am new to this type of code and have spent the majority of the day trying to resolve this.
You can try this:
var rates = (JArray)JObject.Parse(json)["RatingResponse"]["Rates"]["Rate"];

var costs = rates.Select(r => new
{
    CarrierName = r["CarrierName"],
    // inner lambdas use a different parameter name so they don't shadow r
    Costs = ((JArray)((JObject)r["Accessorials"])["Accessorial"])
        .Where(a => (string)a["Description"] != "Discount")
        .Select(a => (double)a["Cost"]).Sum()
}).ToList();
Result:
[
    {
        "CarrierName": "TEST1",
        "Costs": 1984.59
    },
    {
        "CarrierName": "TEST2",
        "Costs": 1984.59
    }
]

DynamoDB: list_append alternative for sets

I am trying to run an update operation on a DynamoDB string set attribute. For lists, the operation would be
set #key = list_append(if_not_exists(#key, :empty_list), :newValue)
But this produces a list attribute. Is there an alternative to list_append for sets?
Since DynamoDB can't store empty sets, this is actually fairly easy: you can just use the ADD action.
Here's an example I've built in Python:
import boto3

TABLE_NAME = "set-demo"

def create_table():
    ddb = boto3.client("dynamodb")
    ddb.create_table(
        AttributeDefinitions=[
            {"AttributeName": "PK", "AttributeType": "S"},
            {"AttributeName": "SK", "AttributeType": "S"}
        ],
        TableName=TABLE_NAME,
        KeySchema=[
            {"AttributeName": "PK", "KeyType": "HASH"},
            {"AttributeName": "SK", "KeyType": "RANGE"}
        ],
        BillingMode="PAY_PER_REQUEST"
    )

def add_to_set(item_id: str, value: str):
    table = boto3.resource("dynamodb").Table(TABLE_NAME)
    table.update_item(
        Key={
            "PK": f"ITEM#{item_id}",
            "SK": "METADATA",
        },
        UpdateExpression="ADD #set_name :set_value",
        ExpressionAttributeNames={
            "#set_name": "values"
        },
        ExpressionAttributeValues={
            ":set_value": {value},  # needs to be a set type
        }
    )

if __name__ == "__main__":
    # create_table()
    add_to_set("a", "value_1")
    add_to_set("a", "value_2")
    add_to_set("a", "value_1")
In Python it's sufficient to pass a value of the set datatype in ExpressionAttributeValues for boto3 to know it needs to convert it into a DynamoDB set under the hood.
The first call to add_to_set creates the set attribute; subsequent calls are just updates to it.
This is what the item looks like in the end:
{
    "PK": {
        "S": "ITEM#a"
    },
    "SK": {
        "S": "METADATA"
    },
    "values": {
        "SS": [
            "value_1",
            "value_2"
        ]
    }
}
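For completeness, removal works the same way with the DELETE action (a sketch following the same pattern as add_to_set; removing the last element deletes the attribute entirely, since DynamoDB can't store empty sets):
def remove_from_set(item_id: str, value: str):
    table = boto3.resource("dynamodb").Table(TABLE_NAME)
    table.update_item(
        Key={
            "PK": f"ITEM#{item_id}",
            "SK": "METADATA",
        },
        # DELETE removes elements from a set; absent values are a no-op
        UpdateExpression="DELETE #set_name :set_value",
        ExpressionAttributeNames={
            "#set_name": "values"
        },
        ExpressionAttributeValues={
            ":set_value": {value},  # again a native Python set
        }
    )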

JQ Cross reference or how to replace one value with another part of the input

I want to parse terraform.tfstate (where the openstack provider is used) to return each instance name and its internal + floating IP (if assigned).
First, select what we are interested in:
jq -r '.modules?[]|.resources[]?|select(.type == "openstack_compute_floatingip_v2", .type == "openstack_compute_instance_v2")' < terraform.tfstate
For simplicity, here is a pre-parsed example produced by the above filter (one FIP and one instance):
{
    "type": "openstack_compute_floatingip_v2",
    "depends_on": [
        "openstack_networking_router_interface_v2.management"
    ],
    "primary": {
        "id": "48b039fc-a9fa-4672-934a-32d6d267f280",
        "attributes": {
            "address": "209.66.89.143",
            "fixed_ip": "10.10.10.5",
            "id": "48b039fc-a9fa-4672-934a-32d6d267f280",
            "instance_id": "597e75e8-834d-4f05-8408-e2e6e733577e",
            "pool": "public",
            "region": "RegionOne"
        },
        "meta": {},
        "tainted": false
    },
    "deposed": [],
    "provider": "provider.openstack"
}
{
    "type": "openstack_compute_instance_v2",
    "depends_on": [
        "openstack_compute_floatingip_v2.management",
        "openstack_compute_secgroup_v2.ssh_only",
        "openstack_networking_network_v2.management"
    ],
    "primary": {
        "id": "597e75e8-834d-4f05-8408-e2e6e733577e",
        "attributes": {
            "access_ip_v4": "10.10.10.5",
            "access_ip_v6": "",
            "all_metadata.%": "1",
            "all_metadata.habitat": "sup",
            "availability_zone": "nova",
            "flavor_id": "eb36e84e-17c1-42ab-b359-4380f6f524ae",
            "flavor_name": "m1.large",
            "force_delete": "false",
            "id": "597e75e8-834d-4f05-8408-e2e6e733577e",
            "image_id": "c574aeed-e47c-4fb7-9da0-75550b76ee56",
            "image_name": "ubuntu-16.04",
            "key_pair": "vault-etcd_test_tf",
            "metadata.%": "1",
            "metadata.habitat": "sup",
            "name": "ctl01",
            "network.#": "1",
            "network.0.access_network": "false",
            "network.0.fixed_ip_v4": "10.10.10.5",
            "network.0.fixed_ip_v6": "",
            "network.0.floating_ip": "",
            "network.0.mac": "02:c6:61:f9:ee:7e",
            "network.0.name": "management",
            "network.0.port": "",
            "network.0.uuid": "f2468669-e321-4eb4-9ede-003e362a8988",
            "region": "RegionOne",
            "security_groups.#": "1",
            "security_groups.1845949017": "vault-etcd_test_ssh_only",
            "stop_before_destroy": "false"
        },
        "meta": {
            "e2bfb730-ecaa-11e6-8f88-34363bc7c4c0": {
                "create": 1800000000000,
                "delete": 1800000000000,
                "update": 1800000000000
            }
        },
        "tainted": false
    },
    "deposed": [],
    "provider": "provider.openstack"
}
What I need: from the "openstack_compute_floatingip_v2" object take .primary.attributes.address and .fixed_ip, and from the instance whose id matches its .instance_id take the .name.
So, something like:
{
    "address": "209.66.89.143",
    "fixed_ip": "10.10.10.5",
    "name": "ctl01"
}
I came up with an idea using walk, but I'm missing how to actually assign the proper value from the corresponding instance id:
jq -r "$(cat floating.jq)" terraform.tfstate
floating.jq:
def walk(f):
    . as $in
    | if type == "object" then
          reduce keys[] as $key
              ({}; . + { ($key): ($in[$key] | walk(f)) }) | f
      elif type == "array" then map(walk(f)) | f
      else f
      end;

.modules?[] | .resources[]?
| select(.type == "openstack_compute_floatingip_v2",
         .type == "openstack_compute_instance_v2")
| .primary
| walk( if type == "object" and .attributes.address then
.attributes.instance_id |= "REFERRED VALUE HERE") else . end)
Let's assume the two related objects are in a file named two.json. Then one way to merge the information from both objects is using the -s command-line option, e.g.
jq -s '
  (.[0].primary.attributes | {address, fixed_ip})
  + {name: .[1].primary.attributes.name}' two.json
Output
With your example input, the output would be:
{
    "address": "209.66.89.143",
    "fixed_ip": "10.10.10.5",
    "name": "ctl01"
}
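If the state contains many FIP/instance pairs, the same join can be cross-checked in Python (a sketch; resources.json is a hypothetical file holding the pre-parsed objects as a JSON array):
import json

with open("resources.json") as f:  # hypothetical: pre-parsed resources as an array
    resources = json.load(f)

# Index instance names by their id, then resolve each floating IP's instance_id.
instances = {
    r["primary"]["id"]: r["primary"]["attributes"]["name"]
    for r in resources
    if r["type"] == "openstack_compute_instance_v2"
}

for r in resources:
    if r["type"] == "openstack_compute_floatingip_v2":
        attrs = r["primary"]["attributes"]
        print(json.dumps({
            "address": attrs["address"],
            "fixed_ip": attrs["fixed_ip"],
            "name": instances.get(attrs["instance_id"]),
        }))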

Create an object with specified indexes

I am trying to loop over every object using jq.
Sample input generated by Elasticsearch:
{
    "took": 202,
    "timed_out": false,
    "aggregations": {
        "aggsDateHistogram": {
            "buckets": [
                {
                    "key": 1465974236000,
                    "search": {
                        "value": 14
                    }
                },
                {
                    "key": 1465975137000,
                    "search": {
                        "value": 16
                    }
                }
            ]
        }
    }
}
I want to have objects that pair each bucket's key with the corresponding value from search.
{ "date": .aggregations.aggsDateHistogram.buckets[].key, "value": .aggregations.aggsDateHistogram.buckets[].search.value }
This gives me objects, but as a cartesian product; I only want matched pairs like
key[1] : search[1].value
key[2] : search[2].value
So you want to produce this output?
[
    {
        "key": 1465974236000,
        "value": 14
    },
    {
        "key": 1465975137000,
        "value": 16
    }
]
The following will do just that:
.aggregations[].buckets
| map({key: .key, value: .search.value})
And from a terminal:
jq '.aggregations[].buckets
| map({key: .key, value: .search.value})' input.json
Here is a slightly simpler solution, which streams each bucket once and so avoids the cartesian product:
[ .aggregations[].buckets[] | {key, value:.search.value} ]
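The same pairing, sketched in Python for comparison (assuming the response is already loaded as data):
buckets = data["aggregations"]["aggsDateHistogram"]["buckets"]
# One pass over the buckets keeps each key matched to its own value.
result = [{"key": b["key"], "value": b["search"]["value"]} for b in buckets]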

Groovy recursive deep object compare not reporting failures

The answer to this question can be found in the link provided by @bdkosher:
kousenit.wordpress.com/2014/04/16/the-closure-of-no-return
I am attempting to write a recursive function that compares two complex objects (JSON objects) and reports the differences in each. The output looks correct, but failures are not reported, because subsequent comparisons take precedence over an earlier failure. (As the linked post explains, a return inside an each closure only returns from the closure, not from the enclosing method, so failure results from nested comparisons are silently dropped.)
I'm new to Groovy, so I'm sure there are groovier ways to write it.
The code:
public boolean diffObjects(Object left, Object right)
{
    if (left == null || right == null) {
        println "Object comparison failure: One or both object null."
        return false
    }
    if (left.getClass() != right.getClass()) {
        println "Object comparison failure: Mismatch object classes."
        return false
    }
    if (isArray(left)) {
        if (left.size() != right.size()) {
            println "Array comparison failure: Object size mismatch."
            println "Left has " + left.size() + " items. Right has " + right.size() + " items."
            println "Left Object:"
            println left
            println "Right Object:"
            println right
            return false
        }
        for (int i = 0; i < left.size(); i++) {
            // May detect matching items here if sort of objects is problem
            diffObjects(left[i], right[i])
        }
    } else if (isLazyMap(left)) {
        String[] leftKeys = left.keySet()
        String[] rightKeys = right.keySet()
        if (leftKeys != rightKeys) {
            println "Map comparison failure: Left keys do not match right keys."
            println "Left " + leftKeys.toString()
            println "Right " + rightKeys.toString()
            return false
        }
        leftKeys.each {
            if (isArray(left[it])) {
                // May detect matching items here if sort of objects is problem
                diffObjects(left[it], right[it])
            } else {
                if (isValue(left[it])) {
                    if (left[it].toString() != right[it].toString()) {
                        println "String comparison failure: Left " + it + " value does not match right value."
                        println "Left " + left[it]
                        println "Right " + right[it]
                        return false
                    }
                }
            }
        }
    }
}
Sample data:
[
    {
        "productId": "141810",
        "sizes": [
            {
                "sku": "11926",
                "size": "L",
                "gtin": "008206",
                "localizedSize": "L",
                "skuCountryItems": [
                    {
                        "country": "CN",
                        "commodityCode": null,
                        "vat": 17
                    }
                ],
                "available": false
            },
            {
                "sku": "1192",
                "size": "M",
                "gtin": "0082065234",
                "localizedSize": "M",
                "skuCountryItems": [
                    {
                        "country": "CN",
                        "commodityCode": null,
                        "vat": 17
                    }
                ],
                "available": false
            },
            {
                "sku": "1192",
                "size": "S",
                "gtin": "0082065234",
                "localizedSize": "S",
                "skuCountryItems": [
                    {
                        "country": "CN",
                        "commodityCode": null,
                        "vat": 17
                    }
                ],
                "available": false
            },
            {
                "sku": "1192",
                "size": "XL",
                "gtin": "0082065234",
                "localizedSize": "XL",
                "skuCountryItems": [
                    {
                        "country": "CN",
                        "commodityCode": null,
                        "vat": 17
                    }
                ],
                "available": true
            }
        ]
    }
]
Any help would be appreciated. Thanks in advance.
Here's an alternative method:
def a = ['a', 'b', 'c']
def b = ['a', 'z', 'c']

def c = [
    [sku: 'abc', size: 'L'],
    [sku: '123', size: 'S']
]
def d = [
    [sku: 'abc', size: 'L'],
    [sku: 'xyz', size: 'S']
]

use(DiffMixin) {
    assert a.diff(b) == [
        [
            [self: 'b', other: 'z'],
            [self: [name: 'bytes', value: [98]], other: [name: 'bytes', value: [122]]]
        ]
    ]
    println c.diff(d)
}

class DiffMixin {
    static List diff(Object self, Object other) {
        def diffs = []
        if (self != other) {
            diffs << [
                'self': self.toString(),
                'other': other.toString()
            ]
        }
        self.properties.inject(diffs) { list, entry ->
            def key = "$entry.key"
            if (self."$key" != other."$key") {
                list << [
                    'self': [name: key, value: self."$key"],
                    'other': [name: key, value: other."$key"]
                ]
            }
            return list
        }
    }

    static List diff(Collection self, Collection other) {
        ([self] + [other])
            .transpose()
            .inject([]) { list, pair ->
                def diffs = diff(pair[0], pair[1])
                if (diffs) list << diffs
                return list
            }
    }
}
I couldn't get a working assert expression for the c-d comparison. The output looks like this:
[[[self:[sku:123, size:S], other:[sku:xyz, size:S]]]]
Explanation
The DiffMixin mixin implements comparison methods for the classes you want to compare. I only implemented two, Object and Collection, but it should give you the idea. The Object comparison compares toString() output and property values. The Collection comparison basically delegates most of the work, while collecting the diffs. These methods return a list describing the differences.
Think of this as a demo :)
