Pivot table in AWK - multidimensional-array

I need to pivot the input so that the values in the second column become column headers and the value of $3 appears under the matching column for each name.
I don't have access to gawk 4, so I cannot work with real multidimensional arrays.
Input
Name^Code^Count
Name1^0029^1
Name1^0038^1
Name1^0053^1
Name2^0013^3
Name2^0018^3
Name2^0023^5
Name2^0025^1
Name2^0029^1
Name2^0038^1
Name2^0053^1
Name3^0018^1
Name3^0060^1
Name4^0018^2
Name4^0025^5
Name5^0018^2
Name5^0025^1
Name5^0060^1
Desired output
Name^0013^0018^0023^0025^0029^0038^0053^0060
Name1^^^^^1^1^1^
Name2^3^3^5^1^1^1^1^
Name3^^1^^^^^^1
Name4^^2^^5^^^^
Name5^^^^1^^^^1
Any suggestions on how to tackle this task without using real multidimensional arrays?

The following solution uses GNU awk v3.2 features (asorti) for sorting. It does not use a real multidimensional array; it only simulates one with comma-separated subscripts.
awk -F"^" '
NR>1{
map[$1,$2] = $3
name[$1]++
value[$2]++
}
END{
printf "Name"
n = asorti(value, v_s)
for(i=1; i<=n; i++) {
printf "%s%s", FS, v_s[i]
}
print ""
m = asorti(name, n_s)
for(i=1; i<=m; i++) {
printf "%s", n_s[i]
for(j=1; j<=n; j++) {
printf "%s%s", FS, map[n_s[i],v_s[j]]
}
print ""
}
}' file
Name^0013^0018^0023^0025^0029^0038^0053^0060
Name1^^^^^1^1^1^
Name2^3^3^5^1^1^1^1^
Name3^^1^^^^^^1
Name4^^2^^5^^^^
Name5^^2^^1^^^^1
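For context on the "simulated" multidimensional array: map[$1,$2] above is really an ordinary one-dimensional array whose key is the two subscripts joined by SUBSEP (by default the control character "\034"). A minimal sketch, separate from the answer's code, showing both that and why missing name/code pairs print as empty cells:
awk 'BEGIN {
    map["Name1", "0029"] = 1                  # stored under the single key "Name1" SUBSEP "0029"
    print (("Name1" SUBSEP "0029") in map)    # prints 1: it is the same element
    print "[" map["Name2", "0013"] "]"        # a missing pair yields "", so this prints []
}'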

This will work with any awk. It orders the output columns (the codes) numerically while keeping the names in the order they occur in your input file:
$ cat tst.awk
BEGIN { FS="^" }
NR>1 {
    if (!seenNames[$1]++) {
        names[++numNames] = $1
    }
    if (!seenCodes[$2]++) {
        # Insertion Sort - start at the end of the existing array and
        # move everything greater than the current value down one slot,
        # leaving open the slot for the current value to be inserted between
        # the last value smaller than it and the first value greater than it.
        for (j=++numCodes; codes[j-1]>$2+0; j--) {
            codes[j] = codes[j-1]
        }
        codes[j] = $2
    }
    count[$1,$2] = $3
}
END {
    printf "%s", "Name"
    for (j=1; j<=numCodes; j++) {
        printf "%s%s", FS, codes[j]
    }
    print ""
    for (i=1; i<=numNames; i++) {
        printf "%s", names[i]
        for (j=1; j<=numCodes; j++) {
            printf "%s%s", FS, count[names[i],codes[j]]
        }
        print ""
    }
}
...
$ awk -f tst.awk file
Name^0013^0018^0023^0025^0029^0038^0053^0060
Name1^^^^^1^1^1^
Name2^3^3^5^1^1^1^1^
Name3^^1^^^^^^1
Name4^^2^^5^^^^
Name5^^2^^1^^^^1
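One detail of the script above: the reference count[names[i],codes[j]] in the END block quietly creates an empty element for every name/code pair that is missing, which is harmless here but worth knowing about. If you ever need to test for a pair without creating it, awk's in operator accepts a parenthesized subscript list; a small sketch of the difference:
awk 'BEGIN {
    count["Name1", "0029"] = 1
    if (("Name1", "0013") in count)    # membership test: does NOT create the element
        print "present"
    else
        print "absent"                 # this branch runs
    n = 0; for (k in count) n++
    print n                            # still 1: the test added nothing
}'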

Since you only have two "dimensions", it is easy enough to use one array for each dimension and a joining array keyed on a calculated composite of the row and column names. I didn't do the sorting of columns or rows, but the idea is pretty basic.
#!/usr/bin/awk -f
#
BEGIN { FS = "^" }
(NR == 1) { next }
{
    rows[$1] = 1
    columns[$2] = 1
    join_table[$1 "-" $2] = $3
}
END {
    printf "Name"
    for (col_name in columns) {
        printf "^%s", col_name
    }
    printf "\n"
    for (row_name in rows) {
        printf "%s", row_name    # explicit "%s" so a "%" in the data cannot break printf
        for (col_name in columns) {
            printf "^%s", join_table[row_name "-" col_name]
        }
        printf "\n"
    }
}
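This last script prints rows and columns in awk's arbitrary for-in order. The column order would have to be fixed inside the script (as the previous answers do), but the row order alone can be repaired afterwards from the shell. A rough sketch, assuming the script above is saved as pivot.awk (a name chosen here just for illustration):
awk -f pivot.awk file > pivot.tmp
head -n 1 pivot.tmp                        # keep the header line first
tail -n +2 pivot.tmp | sort -t'^' -k1,1    # sort the data rows by name
rm pivot.tmp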


merging multiple rows into one row per record separated by a blank line in unix [closed]

I have a text file in which each record starts with a number and a name and ends with a blank line. I would like to have each record on one row as comma-separated values. I have tried the code below; the code file and text file are linked here:
biosample.txt
sark.awk
The Unix command to run the code is:
gawk -f sark.awk biosample.txt
then run:
sed 's/,,/\n/g' <biosample.txt > out.txt
but out.txt comes out messy and confusing.
I want each record on one line, with values extracted for the following headers only:
record name
Identifiers
Organism
strain
isolate
serovar
isolation source
collected by
collection date
geographic location
host
host disease
Accession
ID
potential_contaminant
sample type
Description
The value for each header should be picked from each record, with records separated by a blank line.
Thanks
Here's a straightforward implementation with awk:
BEGIN { print "record name,Identifiers,Organism,strain,isolate,serovar,"\
"isolation source,collected by,collection date,"\
"geographic location,host,host disease,Accession,ID,"\
"potential_contaminant,sample type,Description"
RS="\r\n"
ORS=""
}
sub(/^[0-9]*: /,"") { r[1] = $0; next }
sub(/^Identifiers: /,""){ r[2] = $0; next }
sub(/^Organism: /,"") { r[3] = $0; next }
/^ / { split($0, a, "=") }
/^ *\/strain=/ { r[4] = a[2] }
/^ *\/isolate=/ { r[5] = a[2] }
/^ *\/serovar=/ { r[6] = a[2] }
/^ *\/isolation source=/{ r[7] = a[2] }
/^ *\/collected by=/ { r[8] = a[2] }
/^ *\/collection date=/ { r[9] = a[2] }
/^ *\/geographic locati/{ r[10] = a[2] }
/^ *\/host=/ { r[11] = a[2] }
/^ *\/host disease=/ { r[12] = a[2] }
/^Accession:/ { r[13] = $2; r[14] = $4 }
/^ *\/potential_contami/{ r[15] = a[2] }
/^ *\/sample type=/ { r[16] = a[2] }
/^Description:/ { getline; r[17] = $0 }
/^$/ { if (r[1]) {
           for (i = 1; i < 17; ++i) print r[i] ","
           print r[i] "\n"    # i is 17 after the loop: print the last field, then the record's newline
           delete r
       }
     }
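Saved to a file, say extract.awk (a name used here only for illustration), this runs directly against the OP's data; note that the RS="\r\n" setting above assumes the input file really has CRLF line endings:
gawk -f extract.awk biosample.txt > out.csv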

How to split QString and keep the separator in Qt?

I have a QString: "{x, c | 0x01}", and I want to split it into 7 tokens as below:
{
x
,
c
|
0x01
}
What's the best way to do it in Qt?
I tried to use QString::split(QRegExp("[\\{\\},|]")), but it DOES NOT keep the separator in the result.
Maybe this solution can serve your task:
#include <QString>
#include <QStringList>
#include <QRegExp>
#include <QDebug>

int main(void) {
    QString str { "{x, c | 0x01}" };
    QRegExp separators { "[\\{\\},|]" };
    QStringList list;
    str.replace( " ", "" );
    int mem = 0;
    for(int i = 0; i < str.size(); ++i) {
        if(i == str.indexOf(separators, i)) {
            if(mem) list.append(str.mid(mem, i-mem)); // append str before separator
            list.append(str.mid(i, 1));               // append separator
            mem = i+1;
        }
    }
    qDebug() << list;
    return 0;
}
Outputs: ("{", "x", ",", "c", "|", "0x01", "}")
You can eliminate the if(mem) check, but then call list.pop_front(); or list.removeAll(""); after the for loop, as the first element will be a rubbish "".
Basically, you iterate through the string and check whether a delimiter has been found; if so, you add the delimiter to the list. If not, a new 'word' is started in the list, and characters are appended to that word until the next delimiter is found. Take a look:
//input string
QString str = "{x, c | 0x01}";
QList<QString> out;
//flag used to keep track of whether we're adding a multi-char word, or just a delimiter
bool insideWord = false;
//remove whitespaces
str = str.simplified();
str = str.replace(" ", "");
//iterate through string, check for delimiters, populate out list
for (int i = 0; i < str.length(); i++)
{
    QChar c = str.at(i); //get char at current index
    if (c == '{' || c == '}' || c == ',' || c == '|')
    {
        //append delimiter
        out.append(c);
        insideWord = false;
    }
    else
    {
        //append new word to qlist...
        if (!insideWord)
        {
            out.append(c);
            insideWord = true;
        }
        //but if word already started
        else
        {
            //add 'c' to the word in last index of the qlist
            out.last().append(c);
        }
    }
}
//output as requested by OP
qDebug() << "String is" << out;
//output as requested by OP
qDebug() << "String is" << out;
This can be done with a single regular expression, but it has to use a look-ahead and a look-behind.
The expression specified in the question ([\\{\\},|]) matches a one-character string consisting of any of the characters {, }, , (comma) or |. QString::split then removes that one-character match.
What is needed instead is to match the zero-length position immediately before each of those separators, using a look-ahead (?=[\\{\\},|]), and the zero-length position immediately after each separator, using a look-behind (?<=[\\{\\},|]).
Combining these gives:
QString::split(QRegularExpression("(?=[\\{\\},|])|(?<=[\\{\\},|])"))
With the whitespace removed from the input first (as the other answers do) and empty parts skipped, this gives the desired output: ("{", "x", ",", "c", "|", "0x01", "}")

Intermediate Representations using Javacc

I am trying to write the Intermediate Representations for expressions like:
a= 1+2-3*5/6
a= 1+2-3
a= 5/6+3
I am pretty new to JavaCC, but I have basic knowledge of generating an AST using JJTree. I haven't made a separate class for the IR or implemented it with JJTree; I have written the grammar and tried to implement the IR right in it.
The code is:
// E -> T (addop T)*
// T -> F (mulop F)*
// F -> intlit | "(" E ")"
SKIP :
{
" "
| "\t"
| "\n"
| "\r"
}
TOKEN :
{
< ADDOP :
"+"
| "-" >
| < MULOP :
"*"
| "/" >
| < INTLIT : ([ "0"-"9" ])+ >
}
String S() :
{
String s;
}
{
s = E()
{
return "a=" + s;
}
}
String E() :
{
String left, right;
Token op;
}
{
left = T()
(
op = < ADDOP > right = T()
{
left = ("t" + count++) + ": " + left + op.image + right + "\t";
}
)*
{
return left;
}
}
String T() :
{
String left, right;
Token op;
}
{
left = F()
(
op = < MULOP > right = F()
{
left = ("t" + count++) + ": " + left + op.image + right;
}
)*
{
return left;
}
}
String F() :
{
String s;
Token t;
}
{
t = < INTLIT >
{
return t.image;
}
}
My code works fine for expressions like 1+2*3 or 1-2/4, where no operator is repeated in the expression.
It gives messy output for expressions where one of + - or * / is repeated, or where both + and - (or both * and /) appear, e.g. 1+2-4 (both + and -) or 1-2-3 (minus repeated), and so on. (See the attached picture for the output.)
My questions are:
How can I eliminate the above-mentioned problems?
If my way of doing the IR in the grammar file is not appropriate, what is a better way to produce the IR in JavaCC?
What I would do is use a string buffer, output stream, or mutable list to accumulate the quadruples, and have each nonterminal return the name of the temporary (or the literal) that represents its value.
For example:
String E(StringBuffer buf) :
{
String left, right;
Token op;
}
{
left = T(buf)
(
op = < ADDOP > right = T(buf)
{
left = buildQuad( buf, left, op.image, right ) ;
}
)*
{
return left;
}
}
where buildQuad is defined as
String buildQuad( StringBuffer buf, String left, String op, String right )
{
    String register = "t" + count++;
    buf.append( register + ": " + left + op + right + "\t" );
    return register;
}

Find duplicate words in two text files using command line

I have two text files:
f1.txt
boom Boom pow
Lazy dog runs.
The Grass is Green
This is TEST
Welcome
and
f2.txt
Welcome
I am lazy
Welcome, Green
This is my room
Welcome
bye
On the Ubuntu command line I am trying:
awk 'BEGIN {RS=" "}FNR==NR {a[$1]=NR; next} $1 in a' f1.txt f2.txt
and getting output:
Green
This
is
My desired output is:
lazy
Green
This is
Welcome
Description: I want to compare two text files line by line and output all duplicate words. The matching should not be case sensitive. Also, comparing line by line is preferred over looking for a match from f1.txt anywhere in f2.txt. For example, the word "Welcome" should not be in the desired output if it were on line 6 instead of line 5 of f2.txt.
Well, then. With awk:
awk 'NR == FNR { for(i = 1; i <= NF; ++i) { a[NR,tolower($i)] = 1 }; next } { flag = 0; for(i = 1; i <= NF; ++i) { if(a[FNR,tolower($i)]) { printf("%s%s", flag ? OFS : "", $i); flag = 1 } } if(flag) print "" }' f1.txt f2.txt
This works as follows:
NR == FNR { # While processing the first file:
for(i = 1; i <= NF; ++i) { # Remember which fields were in
a[NR,tolower($i)] = 1 # each line (lower-cased)
}
next # Do nothing else.
}
{ # After that (when processing the
# second file)
flag = 0 # reset flag so we know we haven't
# printed anything yet
for(i = 1; i <= NF; ++i) { # wade through fields (words)
if(a[FNR,tolower($i)]) { # if this field was in the
# corresponding line in the first
# file, then
printf("%s%s", flag ? OFS : "", $i) # print it (with a separator if it
# isn't the first)
flag = 1 # raise flag
}
}
if(flag) { # and if we printed anything
print "" # add a newline at the end.
}
}
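The commented version is a complete awk program in its own right; saved as, say, dups.awk (a name used here just for illustration), it can be run like the one-liner and, against the sample f1.txt and f2.txt above, should print the four lines of the desired output:
$ awk -f dups.awk f1.txt f2.txt
lazy
Green
This is
Welcome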

Where does the "newline" (\n) come from? (pattern matching using "flex")

I have an experimental flex source file(lex.l):
%option noyywrap
%{
int chars = 0;
int words = 0;
int lines = 0;
%}
delim [ \t\n]
ws {delim}+
letter [A-Za-z]
digit [0-9]
id {letter}({letter}|{digit})*
number {digit}+(.{digit}+)?(E[+-]?{digit}+)?
%%
{letter}+ { words++; chars += strlen(yytext); printf("Word\n"); }
\n { chars++; lines++; printf("Line\n"); }
. { chars++; printf("SomethingElse\n"); }
%%
int main(argc, argv)
int argc;
char **argv;
{
if(argc > 1)
{
if(!(yyin = fopen(argv[1], "r")))
{
perror(argv[1]);
return (1);
}
}
yylex();
printf("lines: %8d\nwords: %8d\nchars: %8d\n", lines, words, chars);
}
I created an input file called "input.txt" with "red apple" written in it. Command line:
$ flex lex.l
$ cc lex.yy.c
$ ./a.out < input.txt
Word
SomethingElse
Word
Line
lines: 1
words: 2
chars: 10
Since there is no newline character in the input file, why is the "\n" pattern in lex.l matched? ("lines" should be 0 and "chars" should be 9.)
(I am using OS X.)
Thanks for your time.
It is very possible that your text editor has automatically inserted a newline at the end of the file.
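A quick way to confirm this is to dump the file byte by byte with standard tools; a trailing \n in the output (and a byte count of 10 instead of 9) means the editor appended a newline:
$ od -c input.txt    # prints every byte; look for a trailing \n
$ wc -c input.txt    # 10 bytes instead of 9 points to the same thing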
