I have a data set which looks like this
<SUBBEGIN
IMSI=xxxxxxxxxxxx;
MSISDN=xxxxxxxxx;
DEFCALL=TS11;
CURRENTNAM=BOTH;
CAT=COMMON;
VOLTE_TAG=NOT_DEFINED;
HLR_INDEX=1;
PS_MSISDNLESS_SUPPORTED=FALSE;
CS_MSISDNLESS_SUPPORTED=FALSE;
CSRATTYPE=NO-NO-NO-NO-NO;
PSRATTYPE=NO-NO-NO-NO-NO;
ICI=NO;
STE=NO;
<SUBEND
<SUBBEGIN
IMSI=xxxxxxxxxxxx;
MSISDN=xxxxxxxxx;
DEFCALL=TS11;
CURRENTNAM=BOTH;
VOLTE_TAG=NOT_DEFINED;
HLR_INDEX=1;
PS_MSISDNLESS_SUPPORTED=FALSE;
CS_MSISDNLESS_SUPPORTED=FALSE;
CSRATTYPE=NO-NO-NO-NO-NO;
<SUBEND
This is essentially one record and this is followed by multiple rows in the same format. I want the output to be in the format as:
IMSI|MSISDN|DEFCALL|CURRENTNAM|CAT...
xxxx|xxxx|TS11|BOTH|COMMON|COMMON
Any help is much appreciated.
$ cat tst.awk
# Flatten <SUBBEGIN ... <SUBEND blocks of "KEY=value;" lines into one
# pipe-separated row per block, preceded by a single header row.
#
# Fields are split on "=" or ";", so on a "KEY=value;" line $1 is the key and
# $2 is the value.
BEGIN {FS="[=;]"; OFS="|" }
# Marker lines: <SUBBEGIN is simply skipped, <SUBEND flushes the record.
/^<SUB/ {
if (/END/) {
# hdrPrinted++ is 0 (false) only for the first record, so the header row is
# emitted exactly once, taken from the keys of that first record.
# NOTE(review): later records with a different set of keys are printed under
# that first header without any column alignment.
print (hdrPrinted++ ? "" : hdr ORS ) rec
hdr = rec = ""
}
next
}
# Attribute lines: append the key to hdr and the value to rec.
{
# Stripping leading whitespace modifies $0, which makes awk re-split the
# fields, so $1 never starts with indentation.
sub(/^[[:space:]]+/,"")
hdr = (hdr=="" ? "" : hdr OFS) $1
rec = (rec=="" ? "" : rec OFS) $2
}
$ awk -f tst.awk file
IMSI|MSISDN|DEFCALL|CURRENTNAM|CAT|VOLTE_TAG|HLR_INDEX|PS_MSISDNLESS_SUPPORTED|CS_MSISDNLESS_SUPPORTED|CSRATTYPE|PSRATTYPE|ICI|STE
xxxxxxxxxxxx|xxxxxxxxx|TS11|BOTH|COMMON|NOT_DEFINED|1|FALSE|FALSE|NO-NO-NO-NO-NO|NO-NO-NO-NO-NO|NO|NO
$ cat test.awk
# Print each <SUBBEGIN ... <SUBEND block as two lines: the keys, then the
# values, both joined with "|".
# NOTE(review): every block prints its own key line (not one header for the
# whole file), and both lines end with a trailing "|" because the separator is
# appended after every element.
/<SUBBEGIN/ {f=1; next} # at start: raise the in-record flag
/<SUBEND/ { # at end
print b ORS c # print the key line, then the value line
f=0; b=c="" # lower the flag and reset both accumulators
}
f { # between markers
# $1 is the first whitespace-separated token of the line, so leading
# indentation is ignored and anything after a space inside the line is dropped.
split($1,a,"[=;]") # a[1] = key, a[2] = value
b=b a[1] "|"
c=c a[2] "|"
}
Test it:
$ awk -f test.awk test.txt
IMSI|MSISDN|DEFCALL|CURRENTNAM|CAT|VOLTE_TAG|HLR_INDEX|PS_MSISDNLESS_SUPPORTED|CS_MSISDNLESS_SUPPORTED|CSRATTYPE|PSRATTYPE|ICI|STE|
xxxxxxxxxxxx|xxxxxxxxx|TS11|BOTH|COMMON|NOT_DEFINED|1|FALSE|FALSE|NO-NO-NO-NO-NO|NO-NO-NO-NO-NO|NO|NO|
Workaround using tr
# Join all lines into one, turning each run of newlines into a single space
# (the sample result below is space-separated, not comma-separated).
tr -s '\n' ' ' < file > tmpfile;
This gives me the output in form of
<SUBBEGIN IMSI=xxxxxxxxxxxx; MSISDN=xxxxxxxxx; DEFCALL=TS11; CURRENTNAM=BOTH; CAT=COMMON; VOLTE_TAG=NOT_DEFINED; HLR_INDEX=1; PS_MSISDNLESS_SUPPORTED=FALSE; CS_MSISDNLESS_SUPPORTED=FALSE; CSRATTYPE=NO-NO-NO-NO-NO; PSRATTYPE=NO-NO-NO-NO-NO; ICI=NO; STE=NO; <SUBEND
Replace the string "<SUBBEGIN" with \n
Start with this one:
sed '/<SUBBEGIN/{:a;N;/\<SUBEND/!ba;s/\n[^=]*=/ /g;s/.*SUBBEGIN//;s/;/|/g}' input
Here is another solution:
awk script
#!/bin/awk -f
#
# Convert <SUBBEGIN ... <SUBEND blocks of "KEY=value;" lines into
# pipe-separated rows: one header row (the keys of the first block) followed
# by one row of values per block.
#
# Each block is read as a single record (RS is the "<SUBBEGIN" line) and each
# "KEY=value" line is one field (FS is ";" plus newline). A multi-character
# RS is not supported by every awk; gawk and mawk handle it.

# print_record(hdr): print one output row built from the current record.
#   hdr  non-zero: emit the key names; zero: emit the values.
# Fields without "=" (such as the trailing "<SUBEND" line) are skipped, so the
# number of attributes per block is not fixed.
# i, eq, value, sep and str are locals (extra parameters are awk's way of
# declaring them).
function print_record( hdr,    i, eq, value, sep, str )
{
    str = ""
    sep = ""
    for( i = 1; i <= NF; i++ )
    {
        eq = index( $i, "=" )
        if( eq == 0 )
            continue
        if( hdr )
            value = substr( $i, 1, eq - 1 )
        else
            value = substr( $i, eq + 1 )
        gsub( /^[ \t]+/, "", value )
        # sep stays empty until the first column has been written, so an
        # empty first value still gets its separator.
        str = str sep value
        sep = OFS
    }
    print str
}

BEGIN {
    RS = "<SUBBEGIN\n"
    FS = ";\n"
    hdr = 1
    OFS = "|"
}

{
    # The text before the first <SUBBEGIN is an empty record; only records
    # that contain attribute lines are printed.
    if( index( $0, "=" ) && index( $0, ";" ) )
    {
        if( hdr )
        {
            print_record( 1 )
            hdr = 0
        }
        print_record( 0 )
    }
}
# eof #
Input file
<SUBBEGIN
IMSI=xxxxxxxxxxxx;
MSISDN=xxxxxxxxx;
DEFCALL=TS11;
CURRENTNAM=BOTH;
CAT=COMMON;
VOLTE_TAG=NOT_DEFINED;
HLR_INDEX=1;
PS_MSISDNLESS_SUPPORTED=FALSE;
CS_MSISDNLESS_SUPPORTED=FALSE;
CSRATTYPE=NO-NO-NO-NO-NO;
PSRATTYPE=NO-NO-NO-NO-NO;
ICI=NO;
STE=NO;
<SUBEND
<SUBBEGIN
IMSI=yyyyyyyyyy;
MSISDN=yyyyyyyyy;
DEFCALL=TS11;
CURRENTNAM=BOTH;
CAT=COMMON;
VOLTE_TAG=NOT_DEFINED;
HLR_INDEX=2;
PS_MSISDNLESS_SUPPORTED=TRUE;
CS_MSISDNLESS_SUPPORTED=FALSE;
CSRATTYPE=NO-YES-NO-NO-NO;
PSRATTYPE=NO-NO-NO-YES-NO;
ICI=NO;
STE=NO;
<SUBEND
<SUBBEGIN
IMSI=zzzzzzzzzz;
MSISDN=zzzzzzzzzzzzzzz;
DEFCALL=TS11;
CURRENTNAM=BOTH;
CAT=COMMON;
VOLTE_TAG=NOT_DEFINED;
HLR_INDEX=3;
PS_MSISDNLESS_SUPPORTED=FALSE;
CS_MSISDNLESS_SUPPORTED=TRUE;
CSRATTYPE=NO-YES-YES-NO-NO;
PSRATTYPE=NO-NO-YES-YES-NO;
ICI=YES;
STE=YES;
<SUBEND
Output
$ awk -f script.awk -- input.txt
IMSI|MSISDN|DEFCALL|CURRENTNAM|CAT|VOLTE_TAG|HLR_INDEX|PS_MSISDNLESS_SUPPORTED|CS_MSISDNLESS_SUPPORTED|CSRATTYPE|PSRATTYPE|ICI|STE
xxxxxxxxxxxx|xxxxxxxxx|TS11|BOTH|COMMON|NOT_DEFINED|1|FALSE|FALSE|NO-NO-NO-NO-NO|NO-NO-NO-NO-NO|NO|NO
yyyyyyyyyy|yyyyyyyyy|TS11|BOTH|COMMON|NOT_DEFINED|2|TRUE|FALSE|NO-YES-NO-NO-NO|NO-NO-NO-YES-NO|NO|NO
zzzzzzzzzz|zzzzzzzzzzzzzzz|TS11|BOTH|COMMON|NOT_DEFINED|3|FALSE|TRUE|NO-YES-YES-NO-NO|NO-NO-YES-YES-NO|YES|YES
Hope It Helps!
with unix toolchain, perhaps the shortest...
$ sed '/^</d' file | tr '=' '\n' | tr -d ' ;' | pr -13ts'|'
IMSI|MSISDN|DEFCALL|CURRENTNAM|CAT|VOLTE_TAG|HLR_INDEX|PS_MSISDNLESS_SUPPORTED|CS_MSISDNLESS_SUPPORTED|CSRATTYPE|PSRATTYPE|ICI|STE
xxxxxxxxxxxx|xxxxxxxxx|TS11|BOTH|COMMON|NOT_DEFINED|1|FALSE|FALSE|NO-NO-NO-NO-NO|NO-NO-NO-NO-NO|NO|NO
Using a different input file for simplicity
$ cat ip.txt
<SUBBEGIN
i1=abc;
i2=ijk;
i3=xyz;
k1=NO;
t1=YES;
<SUBEND
<SUBBEGIN
i1=foo;
i2=bar;
i3=test;
k1=YES;
t1=NO;
<SUBEND
$ perl -nle '
# $s is true while we are inside a <SUBBEGIN ... <SUBEND block; it is
# re-evaluated on every marker line, so <SUBEND switches it off.
$s=/<SUBBEGIN/ if /<SUB/;
if($s && !/<SUB/)
{
    # First alternative matches the key (text before "="), second the value
    # (text after "=" up to ";").
    ($k,$v) = /\S+(?==)|=\K[^;]+/g;
    push(@key, $k);
    push(@val, $v);
}
elsif(@key)
{
    # Any other line flushes the collected block, then resets both arrays.
    print join "|", @key;
    print join "|", @val;
    @key = ();
    @val = ();
}
' ip.txt
i1|i2|i3|k1|t1
abc|ijk|xyz|NO|YES
i1|i2|i3|k1|t1
foo|bar|test|YES|NO
$s set flag if input line contains <SUBBEGIN
If flag is set and input line doesn't contain <SUB
extract key, value pair
populate them in two different arrays
Once input line contains <SUB again
Check if one of the arrays (say @key) is not empty
print the key value arrays with | as separator
empty the arrays
This will work whether there are empty lines or not between data structures
Related
Hi I have no idea if this is a legitimate question for SO.
But I have a script i made for sorting photos into folders based on tags. It uses the exifr package to do this.
However, it's running very slowly. I've tried to improve it using guides, but whatever I make ends up not working. Is there someone with an understanding of vectorization and/or optimization who could point me to some suggestions?
Thanks!
#----- Imports ----
library(exifr)
# ---------- Functions ----------
# Negated membership operator: TRUE for every element of `x` that does not
# occur in `y`.
`%!in%` <- function(x, y) {
  !(x %in% y)
}
# Collect the distinct tag values found in one EXIF record.
#
# dat        A one-row data frame (as returned by read_exif() for a single
#            file) or a named atomic vector. Only the first row is inspected.
# tag_names  Names of the columns to look in; defaults to the script-level
#            `keywords_names`, which is only looked up when no value is given.
#
# Returns the unique values in `tag_names` order, or NULL when none of the
# columns is present.
tagcatcher <- function(dat, tag_names = keywords_names) {
  present <- intersect(tag_names, names(dat))
  per_tag <- lapply(present, function(tag_name) {
    xs <- dat[tag_name]
    if (typeof(xs) == "list") {
      # xs[[1]] is the column; its first element is either a single string or
      # (for a list column) a vector of strings. unlist() handles both and
      # also copes with an empty column.
      unlist(xs[[1]][1], use.names = FALSE)
    } else {
      xs
    }
  })
  unique(unlist(per_tag))
}
# ----------- Settings ----------
ss <- "/"
# NOTE(review): "H:MyPhotos" is a drive-relative path on Windows; confirm that
# "H:/MyPhotos" was not intended.
haystacks <- c("H:MyPhotos")
organizedMediaPhotos <- "V:/Photos"
all_files <- list.files(haystacks, recursive = TRUE, full.names = TRUE)
keywords_names <- c("Category", "XPKeywords", "Keywords")
# Scan the destination tree once and drop the root itself; what remains are
# the existing tag folders.
all_dirs <- list.dirs(organizedMediaPhotos)
ctags <- all_dirs[!(all_dirs %in% organizedMediaPhotos)]
# The tag name is the last path component of each folder.
current_tags <- basename(ctags)
# Main loop: read the tags of all files with a single read_exif() call
# (starting exiftool once per file is what made the original loop slow), then
# copy each file into one folder per tag.
if (length(all_files) > 0) {
  exif_data <- read_exif(all_files, tags = keywords_names)
  for (i in seq_len(nrow(exif_data))) {
    cur_file <- exif_data$SourceFile[[i]]
    print(cur_file)
    tags <- tagcatcher(exif_data[i, , drop = FALSE])
    # The batched result has a column for every tag found in any file, so a
    # file lacking a tag yields NA here; NA is not a folder name.
    tags <- tags[!is.na(tags)]
    for (tag in tags) {
      tag_folder <- paste0(organizedMediaPhotos, ss, tag)
      # Check the disk rather than the start-up snapshot in current_tags, so a
      # folder created earlier in this run is not created again.
      if (!dir.exists(tag_folder)) {
        dir.create(tag_folder)
        print(paste("creating tag folder: ", tag_folder))
      }
      pic_path <- paste0(tag_folder, ss, basename(cur_file))
      if (!file.exists(pic_path)) {
        file.copy(cur_file, pic_path)
        print(paste("copied file from ", cur_file, " to ", pic_path))
      }
    }
  }
}
You can give this a try
# Move every *.jpg in the current directory into a folder named after the
# file's last-modification date (YYYY-MM-DD).
for x in *.jpg; do
  # Without any match the glob stays literal ("*.jpg"); skip it instead of
  # running date/mv on a file that does not exist.
  [ -e "$x" ] || continue
  # Skip the file if its date cannot be read, rather than moving it into "".
  d=$(date -r "$x" +%Y-%m-%d) || continue
  mkdir -p -- "$d"
  mv -- "$x" "$d/"
done
For powershell:
# Script parameters:
#   $source  root folder whose files are processed (passed to Get-ChildItem -Recurse)
#   $dest    root folder under which the date-named folders are created
#   $format  Get-Date -Format string used to build the destination sub-folder
#            from each file's date
Param(
[string]$source,
[string]$dest,
[string]$format = "yyyy/yyyy_MM/yyyy_MM_dd"
)
# Shell COM object used by Get-File-Date to read the files' detail properties.
$shell = New-Object -ComObject Shell.Application
# Return the date to file $object under: its "Date Taken" detail if present,
# otherwise the oldest value among the detail properties whose name contains
# "date" or "created". Returns $null when no such property has a value.
# $object is expected to expose .Directory.FullName and .Name (the caller
# passes items coming from Get-ChildItem).
function Get-File-Date {
[CmdletBinding()]
Param (
$object
)
# Resolve the folder and the file inside it through the Shell COM object.
$dir = $shell.NameSpace( $object.Directory.FullName )
$file = $dir.ParseName( $object.Name )
# First see if we have Date Taken, which is at index 12
$date = Get-Date-Property-Value $dir $file 12
if ($null -eq $date) {
# If we don't have Date Taken, then find the oldest date from all date properties
0..287 | ForEach-Object {
# NOTE(review): presumably this call yields the property's column title for
# index $_ -- confirm against the Shell Folder.GetDetailsOf documentation.
$name = $dir.GetDetailsof($dir.items, $_)
if ( $name -match '(date)|(created)') {
# Only get value if date field because the GetDetailsOf call is expensive
$tmp = Get-Date-Property-Value $dir $file $_
# Keep the earliest non-null date seen so far.
if ( ($null -ne $tmp) -and (($null -eq $date) -or ($tmp -lt $date))) {
$date = $tmp
}
}
}
}
return $date
}
# Read detail property $index of $file from the shell folder $dir and parse it
# as a date. Returns a [DateTime], or $null when the property is empty.
function Get-Date-Property-Value {
[CmdletBinding()]
Param (
$dir,
$file,
$index
)
# Strip the U+200E / U+200F directional marks from the returned text before
# parsing. NOTE(review): the `u{...} escape needs PowerShell 6 or later.
$value = ($dir.GetDetailsof($file, $index) -replace "`u{200e}") -replace "`u{200f}"
if ($value -and $value -ne '') {
# "g" is the general short date/time format; the $null provider means the
# current culture is used -- TODO confirm this matches the shell's output.
return [DateTime]::ParseExact($value, "g", $null)
}
return $null
}
# Move every file below $source into $dest\<date folder>, where the folder
# name is the file's date (see Get-File-Date) formatted with $format. Files
# without a usable date are left where they are.
Get-ChildItem -Attributes !Directory $source -Recurse |
Foreach-Object {
    Write-Host "Processing $_"
    $date = Get-File-Date $_
    if ($date) {
        $destinationFolder = Get-Date -Date $date -Format $format
        $destinationPath = Join-Path -Path $dest -ChildPath $destinationFolder
        # See if the destination file exists and rename until we get a unique name
        $newFullName = Join-Path -Path $destinationPath -ChildPath $_.Name
        if ($_.FullName -eq $newFullName) {
            Write-Host "Skipping: Source file and destination files are at the same location. $_"
            # Inside Foreach-Object, return only ends this iteration.
            return
        }
        $newNameIndex = 1
        $newName = $_.Name
        while (Test-Path -Path $newFullName) {
            $newName = ($_.BaseName + "_$newNameIndex" + $_.Extension)
            $newFullName = Join-Path -Path $destinationPath -ChildPath $newName
            $newNameIndex += 1
        }
        # If we have a new name, then we need to rename in current location before moving it.
        if ($newNameIndex -gt 1) {
            Rename-Item -Path $_.FullName -NewName $newName
        }
        Write-Host "Moving $_ to $newFullName"
        # Create the destination directory if it doesn't exist
        if (!(Test-Path $destinationPath)) {
            New-Item -ItemType Directory -Force -Path $destinationPath
        }
        # /mov makes robocopy delete the source file after copying it, i.e. a
        # move; the listing had been cut off at "/mo" with both closing braces
        # missing.
        robocopy $_.DirectoryName $destinationPath $newName /mov
    }
}
PS: I had tried this a few years back and it worked like a charm
You can change your if command to something like this:
if [[ "$t" =~ IMG_+[0-9]{8}[a-zA-Z]*$ ]]
The =~ is a regular expression comparison operator which is introduced in bash version 3 and above.
By using this if statement you can catch names like IMG_11111111alphabets.ext. You can play with it and customize it according to your needs. For more information have a look at this: Bash's regular expression
How does jq combine the outputs of several filters? The following jq command does not generate an output.json containing the input argument value ('jack').
input.json
{
"key1": "",
"key2": ""
}
jq --arg input "$username" \
'if .key1 == "<value1>"
then . + {"key1" : ($input) }
else . end' input.json |
'if .key2 == "<value2>"
then . + {"key2" : ($input) }
else . end' > output.json
output.json
{
"key1": "jack",
"key2": "jack"
}
The filter you are evidently trying to write is:
if .key1 == "" then . + {"key1" : $input } else . end
| if .key2 == "" then . + {"key2" : $input } else . end
This can be simplified to:
if .key1 == "" then .key1 = $input else . end
| if .key2 == "" then .key2 = $input else . end
You might also like to consider the following approach:
# update(f): at every path selected by f, replace the value with $input when
# it is the empty string and leave it unchanged otherwise.
def update(f): f |= (if . == "" then $input else . end);
# Apply the same rule to both keys in turn.
update(.key1) | update(.key2)
I am comparing two files using unix Awk
col_1,col_2,col_3
1,2,4
1,3,6
col_1,col_3,col2,col_5,col_6
1,2,3,4,5
1,6,3,,,
Below is the code i am using
# Compare two comma-separated files row by row and report differing columns.
# The first file argument (file2) is the reference; the second (file1) is
# checked against it. Rows are paired by line number, columns by header name.
awk '
# First file: remember each header name with its column position (REF) and
# the column count (MX); keep every data line by line number (TMP).
NR == FNR {if (NR == 1) for (MX=n=NF; n>0; n--) REF[$n] = n
else TMP[NR] = $0
next
}
# Header of the second file: CMP[n] is the reference column holding the same
# header as column n; NSR is the column named SRCH; HD[n] keeps the names.
FNR == 1 {for (n=NF; n>0; n--) {if ($n in REF) CMP[n]=REF[$n]
if ($n == SRCH) NSR = n
HD[n] = $n
NL = "Null"
}
next
}
# Data lines of the second file: split the reference line with the same line
# number and compare column by column.
# NOTE(review): the loop bound MX is the column count of the first file while
# $i and CMP[i] are indexed by columns of the second file -- confirm this is
# intended when the two files have different widths.
{n = split (TMP[FNR], IT)
EQU = 1
for (i=1; i<=MX; i++) {T = IT[CMP[i]]
# Values that are empty (or evaluate as false) are shown as "Null".
if ($i != T) {print SRCH, $NSR ": mismatch at", HD[i] ":", $i?$i:NL, "-", T?T:NL
EQU = 0
}
}
if (EQU) print SRCH, $NSR, "doesn´t have any mismatch."
}
' FS="," SRCH="col_1" file2 file1 # comparison files
I need to replace the mismatch records based on file2, can some one please guide
awk to the rescue!
$ awk -F, -v OFS=, 'NR==1 {n1=split($0,cols1);h=$0;next}
# file1 header: cols1[] holds its column names in order, h the header line.
# Remaining file1 lines: remember the key found in column 1.
NR==FNR {a[$1]; next}
# file2 header: print the file1 header and map every file2 column name to its
# position. The map must start at column 1; otherwise the key column is
# missing from it and the lookup below falls back to field 0, the whole line.
FNR==1 {print h; n2=split($0,cols2);
for(i=1;i<=n2;i++) cols[cols2[i]]=i; next}
# file2 data lines whose key exists in file1: emit the file2 values in file1
# column order.
$1 in a {for(i=1;i<=n1;i++)
printf "%s%s",
$(cols[cols1[i]]),(i==n1)?ORS:OFS}' file1 file2
col_1,col_2,col_3
1,3,2
1,3,6
your second file column header has a typo.
Would like to extract the line items, if the date range between 25-mar-2015 to 05-may-2015 from second field ($2) .
Date column is not sorted and each files contain millions of records.
Inputs.gz
Des,DateInfo,Amt,Loc,Des2
abc,02-dec-2014,10,def,xyz
abc,20-apr-2015,25,def,xyz
abc,14-apr-2015,40,def,xyz
abc,17-mar-2014,55,def,xyz
abc,24-nov-2011,70,def,xyz
abc,13-may-2015,85,def,xyz
abc,30-sep-2008,100,def,xyz
abc,20-jan-2014,115,def,xyz
abc,04-may-2015,130,def,xyz
abc,25-nov-2013,145,def,xyz
abc,29-mar-2015,55,def,xyz
I have tried the script below, but it is incomplete:
function getDate(date) {
split(date, a, "-");
return mktime(a[3] " " sprintf("%02i",(index("janfebmaraprmayjunjulaugsepoctnovdec", a[2])+2)/3) " " a[1] " 00 00 00")
}
BEGIN {FS=","}
{ if ( getDate($2)>=getDate(25-mar-2015) && getDate($2)<=getDate(05-may-2015) ) print $0 }
Expected Output:
abc,20-apr-2015,25,def,xyz
abc,14-apr-2015,40,def,xyz
abc,04-may-2015,130,def,xyz
abc,29-mar-2015,55,def,xyz
Please suggest ... I dont have perl & python access.
$ cat tst.awk
# Print the data lines whose second field (dd-mon-yyyy) lies between
# 25-mar-2015 and 05-may-2015, both inclusive.
#
# getDate(date): convert "dd-mon-yyyy" to the timestamp returned by mktime().
# `a` is a local scratch array. mktime() is a gawk extension.
function getDate(date, a) {
split(date, a, /-/)
# index() gives the 1-based position of the month abbreviation in the packed
# string (1, 4, 7, ...), so (position+2)/3 is the month number 1..12.
# NOTE(review): the lookup is case-sensitive; month names that are not
# lower-case are not found.
return mktime(a[3]" "(index("janfebmaraprmayjunjulaugsepoctnovdec",a[2])+2)/3" "a[1]" 0 0 0")
}
BEGIN {
FS=","
# Convert the range bounds once instead of once per input line.
beg = getDate("25-mar-2015")
end = getDate("05-may-2015")
}
{ cur = getDate($2) }
# NR>1 skips the header line; a pattern without an action prints the line.
NR>1 && cur>=beg && cur<=end
$ awk -f tst.awk file
abc,20-apr-2015,25,def,xyz
abc,14-apr-2015,40,def,xyz
abc,04-may-2015,130,def,xyz
abc,29-mar-2015,55,def,xyz
I have a left recursive issue in my Antlr grammar. While I think I understand why there is a problem I am unable to think of a solution. The issue is with the last line for my datatype rule. I have included the entire grammar for you to see:
grammar Test;
options {output=AST;ASTLabelType=CommonTree;}
tokens {FUNCTION; ATTRIBUTES; CHILDREN; COMPOSITE;}
program : function ;
function : ID (OPEN_BRACKET (attribute (COMMA? attribute)*)? CLOSE_BRACKET)? (OPEN_BRACE function* CLOSE_BRACE)? SEMICOLON? -> ^(FUNCTION ID ^(ATTRIBUTES attribute*) ^(CHILDREN function*)) ;
attribute : ID (COLON | EQUALS) datatype -> ^(ID datatype);
datatype : ID -> ^(STRING["id"] ID)
| NUMBER -> ^(STRING["number"] NUMBER)
| STRING -> ^(STRING["string"] STRING)
| BOOLEAN -> ^(STRING["boolean"] BOOLEAN)
| array -> ^(STRING["array"] array)
| lookup -> ^(STRING["lookup"] lookup)
| datatype PLUS datatype -> ^(COMPOSITE datatype datatype) ;
array : OPEN_BOX (datatype (COMMA datatype)*)? CLOSE_BOX -> datatype* ;
lookup : OPEN_BRACE (ID (PERIOD ID)*) CLOSE_BRACE -> ID* ;
NUMBER
: ('+' | '-')? (INTEGER | FLOAT)
;
STRING
: '"' ( ESC_SEQ | ~('\\'|'"') )* '"'
;
BOOLEAN
: 'true' | 'TRUE' | 'false' | 'FALSE'
;
ID : (LETTER|'_') (LETTER | INTEGER |'_')*
;
COMMENT
: '//' ~('\n'|'\r')* '\r'? '\n' {$channel=HIDDEN;}
| '/*' ( options {greedy=false;} : . )* '*/' {$channel=HIDDEN;}
;
WHITESPACE : (' ' | '\t' | '\r' | '\n') {$channel=HIDDEN;} ;
COLON : ':' ;
SEMICOLON : ';' ;
COMMA : ',' ;
PERIOD : '.' ;
PLUS : '+' ;
EQUALS : '=' ;
OPEN_BRACKET : '(' ;
CLOSE_BRACKET : ')' ;
OPEN_BRACE : '{' ;
CLOSE_BRACE : '}' ;
OPEN_BOX : '[' ;
CLOSE_BOX : ']' ;
fragment
LETTER
: 'a'..'z' | 'A'..'Z'
;
fragment
INTEGER
: '0'..'9'+
;
fragment
FLOAT
: INTEGER+ '.' INTEGER*
;
fragment
ESC_SEQ
: '\\' ('b'|'t'|'n'|'f'|'r'|'\"'|'\''|'\\')
;
I am developing using Antlr works which provides a function to resolve this issue - but unfortunately it does not seem to work :s
Any help would be great.
Thanks.
EDIT:
Here is an example of the language I'm trying to implement / parse
<FunctionName> <OptionalAttributes> <OptionalChildFunctions>
So for example:
ForEach(in:[1,2,3,4,5] as:"i") {
Switch(value:{i}) {
Case(value:3) {
Print(message:"This is the number 3")
}
Default {
Print(message:"This isn't the number 3")
}
}
}
Okay, this should do the trick:
grammar Test;
/************************************** PARSER **************************************/
// Entry point: exactly one function followed by end of input.
program
: function EOF
;
// A function is a name, an optional parenthesised attribute list, an optional
// brace-delimited list of child functions and an optional semicolon.
// NOTE(review): attributes must be comma-separated here, whereas the
// question's sample ForEach(in:[1,2,3,4,5] as:"i") separates them with
// whitespace only -- confirm whether COMMA should be optional.
function
: ID (OPEN_PAREN (attribute (COMMA attribute)*)? CLOSE_PAREN)?
(OPEN_BRACE function* CLOSE_BRACE)?
SEMICOLON?
;
// name, optional ':' or '=', then the value expression.
attribute
: ID (COLON | EQUALS)? expression
;
// One or more atoms joined by '+'. Writing the repetition as (PLUS atom)*
// is what removes the left recursion of "datatype PLUS datatype".
expression
: atom (PLUS atom)*
;
// The non-composite value kinds.
atom
: ID
| STRING
| BOOLEAN
| NUMBER
| array
| lookup
;
// '[' comma-separated expressions ']' (may be empty).
array
: OPEN_BOX (expression (COMMA expression)*)? CLOSE_BOX
;
// '{' dotted identifier path '}'.
lookup
: OPEN_BRACE (ID (PERIOD ID)*) CLOSE_BRACE
;
/************************************** LEXER **************************************/
NUMBER : ('+' | '-')? (INTEGER | FLOAT)
;
STRING : '"' ( ESC_SEQ | ~('\\'|'"') )* '"'
;
BOOLEAN : 'true' | 'TRUE' | 'false' | 'FALSE'
;
ID : (LETTER|'_') (LETTER | INTEGER |'_')*
;
COMMENT : '//' ~('\n'|'\r')* ('\r'? '\n'| EOF) {$channel=HIDDEN;}
| '/*' ( options {greedy=false;} : . )* '*/' {$channel=HIDDEN;}
;
WHITESPACE : (' ' | '\t' | '\r' | '\n') {$channel=HIDDEN;} ;
COLON : ':' ;
SEMICOLON : ';' ;
COMMA : ',' ;
PERIOD : '.' ;
PLUS : '+' ;
EQUALS : '=' ;
OPEN_PAREN : '(' ;
CLOSE_PAREN : ')' ;
OPEN_BRACE : '{' ;
CLOSE_BRACE : '}' ;
OPEN_BOX : '[' ;
CLOSE_BOX : ']' ;
fragment
LETTER : 'a'..'z' | 'A'..'Z' ;
fragment
INTEGER : '0'..'9'+ ;
fragment
FLOAT : INTEGER+ '.' INTEGER* ;
fragment
ESC_SEQ : '\\' ('b'|'t'|'n'|'f'|'r'|'\"'|'\''|'\\') ;
Note that I've changed the names of OPEN_BRACKET and CLOSE_BRACKET to OPEN_PAREN and CLOSE_PAREN. The round ones, ( and ), are parentheses; the square ones, [ and ], are called brackets (the ones you called boxes, but calling them boxes doesn't hurt IMO).