awk sort count values sum duplicates and transpose columns - sorting

i am playing with this to get output for more easy readable format as trying to get based on uniq column nr4 and nr2 where OK is status and X is "KO" status ,
which belongs to column nr1 where is not all the time and even rule to have OK status for F1 F2 .. values in nr.3 .
starting to work with this input:
input:
NR5X1 OK F1 SEAT5
NR5X1 OK F2 SEAT5
NR5X1 X F3 SEAT5
NR5X1 X F4 SEAT5
NR5X1 X F5 SEAT5
NR5X1 X F6 SEAT5
NR5X1 X F7 SEAT5
NR5X1 X F8 SEAT5
NR5X2 OK F1 SEAT5
NR5X2 OK F2 SEAT5
NR5X2 X F3 SEAT5
NR5X2 X F4 SEAT5
NR5X2 X F5 SEAT5
NR5X2 X F6 SEAT5
NR5X2 X F7 SEAT5
NR5X2 X F8 SEAT5
NR5X3 OK F5 FLEET
NR5X3 OK F6 FLEET
NR5X5 OK F5 FLEET
NR5X5 OK F6 FLEET
NR5X7 F5 X ROME
NR5X7 F6 X ROME
NR5X8 F5 X ROME
NR5X8 F6 OK ROME
and trying to achieve this output
2 OK of 8 | SEAT5 NR5X1 OK F1 OK F2 X F3 X F4 X F5 X F6 X F7 X F8
2 OK of 8 | SEAT5 NR5X2 OK F1 OK F2 X F3 X F4 X F5 X F6 X F7 X F8
2 OK of 2 | FLEET NR5X3 OK F5 OK F6
2 OK of 2 | FLEET NR5X5 OK F5 OK F6
2 X of 2 | ROME NR5X7 X F5 X F6
1 OK of 2 | ROME NR5X8 X F5 OK F6
partially moved with this code:
cat file | awk '!seen[$1]++ {print $4,$1} {print $3,$2}' ,xargs on multiple names will join all on one line so i stucked here
ROME NR5X7
X F5
X F6
ROME NR5X8
X F5
OK F6
not sure if there is other easy method how to pretify output but thats last thing which is needed in fact
any suggestions more then welcomed , good day

$ cat tst.awk
$1 != prev {
if (NR > 1) {
prt()
}
prev = $1
}
{
key = $1
type = $4
if (type == "ROME") {
stat = $3
fval = $2
}
else {
stat = $2
fval = $3
}
statCnt[stat]++
totCnt++
pairs = pairs OFS sprintf("%-2s %-2s",stat,fval)
}
END { prt() }
function prt( stat) {
stat = ("OK" in statCnt ? "OK" : "X")
printf "%d %-2s of %d | %-5s %s%s\n", statCnt[stat], stat, totCnt, type, key, pairs
delete statCnt
totCnt = 0
pairs = ""
}
.
$ awk -f tst.awk file
2 OK of 8 | SEAT5 NR5X1 OK F1 OK F2 X F3 X F4 X F5 X F6 X F7 X F8
2 OK of 8 | SEAT5 NR5X2 OK F1 OK F2 X F3 X F4 X F5 X F6 X F7 X F8
2 OK of 2 | FLEET NR5X3 OK F5 OK F6
2 OK of 2 | FLEET NR5X5 OK F5 OK F6
2 X of 2 | ROME NR5X7 X F5 X F6
1 OK of 2 | ROME NR5X8 X F5 OK F6

Try this Perl solution
$ seats.ksh ya801.txt
2 OK of 2 | FLEET NR5X3 OK F5 OK F6
1 OK of 2 | ROME NR5X8 X F5 OK F6
2 OK of 2 | FLEET NR5X5 OK F5 OK F6
2 OK of 8 | SEAT5 NR5X1 OK F1 OK F2 X F3 X F4 X F5 X F6 X F7 X F8
2 X of 2 | ROME NR5X7 X F5 X F6
2 OK of 8 | SEAT5 NR5X2 OK F1 OK F2 X F3 X F4 X F5 X F6 X F7 X F8
Script:
$ cat seats.ksh
perl -lane '
$x="$F[3] $F[0]";
$kv{$x}++;
#t=#{$kv2{$x}};
push(#t,"$F[1] $F[2]");
$kv2{$x}=[#t];
#tok=#{$kvok{$x}};
if ( $F[1] eq "OK" ) { push(#tok,$F[1]); $kvok{$x}=[#tok] }
#tx=#{$kvx{$x}};
if ( $F[1] eq "X" ) { push(#tx,$F[1]); $kvx{$x}=[#tx] }
END
{
foreach $p (keys %kv)
{
#oks=#{$kvok{$p}};
#xs=#{$kvx{$p}};
if( scalar #oks ) { $okcount=sprintf("%d OK of %d | ",scalar #oks, $kv{$p}) }
else { $okcount=sprintf("%d X of %d | ",scalar #xs, $kv{$p}) }
print $okcount, "$p ", join(" ",#{$kv2{$p}})
}
}
' $1
$
Note: It looks like columns 2 and 3 are swapped in the last 4 lines.. so I changed them
$ cat ya801.txt
NR5X1 OK F1 SEAT5
NR5X1 OK F2 SEAT5
NR5X1 X F3 SEAT5
NR5X1 X F4 SEAT5
NR5X1 X F5 SEAT5
NR5X1 X F6 SEAT5
NR5X1 X F7 SEAT5
NR5X1 X F8 SEAT5
NR5X2 OK F1 SEAT5
NR5X2 OK F2 SEAT5
NR5X2 X F3 SEAT5
NR5X2 X F4 SEAT5
NR5X2 X F5 SEAT5
NR5X2 X F6 SEAT5
NR5X2 X F7 SEAT5
NR5X2 X F8 SEAT5
NR5X3 OK F5 FLEET
NR5X3 OK F6 FLEET
NR5X5 OK F5 FLEET
NR5X5 OK F6 FLEET
NR5X7 X F5 ROME
NR5X7 X F6 ROME
NR5X8 X F5 ROME
NR5X8 OK F6 ROME
$
Explanation:
perl -lane '
$x="$F[3] $F[0]"; # Capture 4 & 1 col together
$kv{$x}++; # hash "kv" for the overall count i.e the 8 in "2 OK of 8"
#t=#{$kv2{$x}}; # dereference the array in #t i.e 2nd and 3rd column together
push(#t,"$F[1] $F[2]"); # append to the array with new value if 2 & 3 col together
$kv2{$x}=[#t]; # reassign it bact to hash "kv2"
#tok=#{$kvok{$x}}; # dereference the array in #tok for capturing OK
if ( $F[1] eq "OK" ) { push(#tok,$F[1]); $kvok{$x}=[#tok] } # append only if "OK" is present in the 2nd column => uses hash kvok
#tx=#{$kvx{$x}}; # dereference the array in #tx for capturing X
if ( $F[1] eq "X" ) { push(#tx,$F[1]); $kvx{$x}=[#tx] } # append only if "X" is present in the 2nd column => uses hash kvx
END
{
foreach $p (keys %kv) # loop through the kv hash
{
#oks=#{$kvok{$p}}; # dereference and get the OK array from kvok hash
#xs=#{$kvx{$p}}; # dereference and get the X array from kvx hash
if( scalar #oks ) { $okcount=sprintf("%d OK of %d | ",scalar #oks, $kv{$p}) } # scalar #oks gives the array size. Only if the OK count is > 1 then print like "2 OK of 8"
else { $okcount=sprintf("%d X of %d | ",scalar #xs, $kv{$p}) } # false then ise the X array
print $okcount, "$p ", join(" ",#{$kv2{$p}}) # print all of them together hash kv2 contains the arrays of 2&3 col together
}
}
' $1

Related

Simple conditional unexpectedly fails in bash if statement

I've spent an embarrassingly long time trying to understand why the second conditional in the "foo" script below fails but the first one succeeds.
Please note:
The current directory contains two files: bar and foo.
All three strings $s1, $s2 and $s3 are equal according to hexdump.
Thanks in advance for any help.
Session: (Running on a Centos7 host):
>ls
bar foo
>cat foo
#!/bin/bash
s1="bar foo"
s2="bar foo"
s3=`ls`
echo -n $s1 | hexdump -C
echo -n $s2 | hexdump -C
echo -n $s3 | hexdump -C
if [ "$s1" = "$s2" ]; then # True
echo s1 = s2
fi
if [ "$s1" = "$s3" ]; then # NOT true! Why?
echo s1 = s3
fi
>foo
00000000 62 61 72 20 66 6f 6f |bar foo|
00000007
00000000 62 61 72 20 66 6f 6f |bar foo|
00000007
00000000 62 61 72 20 66 6f 6f |bar foo|
00000007
s1 = s2
>
Quote the variables when echoing.
echo -n "$s3" | hexdump -C
You'll see a newline between the file names, as ls uses -1 when the output is redirected.
Your demo would be more convincing with echo -n "$s1" etc. That would show that there's a newline in the middle of s3 where there's a space in s1 and s2. The echo without the double quotes mangles the newline into a space (and generally each sequence of one or more white space characters in the string into a single space).
Given:
#!/bin/bash
s1="bar foo"
s2="bar foo"
s3=`ls`
echo -n "$s1" | hexdump -C
echo -n "$s2" | hexdump -C
echo -n "$s3" | hexdump -C
if [ "$s1" = "$s2" ]; then # True
echo s1 = s2
fi
if [ "$s1" = "$s3" ]; then # NOT true because s3 contains a newline!
echo s1 = s3
fi
I get:
$ sh foo
00000000 2d 6e 20 62 61 72 20 66 6f 6f 0a |-n bar foo.|
0000000b
00000000 2d 6e 20 62 61 72 20 66 6f 6f 0a |-n bar foo.|
0000000b
00000000 2d 6e 20 62 61 72 0a 66 6f 6f 0a |-n bar.foo.|
0000000b
s1 = s2
$ bash foo
00000000 62 61 72 20 66 6f 6f |bar foo|
00000007
00000000 62 61 72 20 66 6f 6f |bar foo|
00000007
00000000 62 61 72 0a 66 6f 6f |bar.foo|
00000007
s1 = s2
$

transpose lines to columns [duplicate]

i am trying to transpose a table (10k rows X 10K cols) using the following script.
A simple data example
$ cat rm1
t1 t2 t3
n1 1 2 3
n2 2 3 44
n3 1 1 1
$ sh transpose.sh rm1
n1 n2 n3
t1 1 2 1
t2 2 3 1
t3 3 44 1
However, I am getting memory error. Any help would be appreciated.
awk -F "\t" '{
for (f = 1; f <= NF; f++)
a[NR, f] = $f
}
NF > nf { nf = NF }
END {
for (f = 1; f <= nf; f++)
for (r = 1; r <= NR; r++)
printf a[r, f] (r==NR ? RS : FS)
}'
Error
awk: cmd. line:2: (FILENAME=input FNR=12658) fatal: dupnode: r->stptr: can't allocate 10 bytes of memory (Cannot allocate memory)
Here's one way to do it, as I mentioned in my comments, in chunks. Here I show the mechanics on a tiny 12r x 10c file, but I also ran a chunk of 1000 rows on a 10K x 10K file in not much more than a minute (Mac Powerbook).6
EDIT The following was updated to consider an M x N matrix with unequal number of rows and columns. The previous version only worked for an 'N x N' matrix.
$ cat et.awk
BEGIN {
start = chunk_start
limit = chunk_start + chunk_size - 1
}
{
n = (limit > NF) ? NF : limit
for (f = start; f <= n; f++) {
a[NR, f] = $f
}
}
END {
n = (limit > NF) ? NF : limit
for (f = start; f <= n; f++)
for (r = 1; r <= NR; r++)
printf a[r, f] (r==NR ? RS : FS)
}
$ cat t.txt
10 11 12 13 14 15 16 17 18 19
20 21 22 23 24 25 26 27 28 29
30 31 32 33 34 35 36 37 38 39
40 41 42 43 44 45 46 47 48 49
50 51 52 53 54 55 56 57 58 59
60 61 62 63 64 65 66 67 68 69
70 71 72 73 74 75 76 77 78 79
80 81 82 83 84 85 86 87 88 89
90 91 92 93 94 95 96 97 98 99
A0 A1 A2 A3 A4 A5 A6 A7 A8 A9
B0 B1 B2 B3 B4 B5 B6 B7 B8 B9
C0 C1 C2 C3 C4 C5 C6 C7 C8 C9
$ cat et.sh
inf=$1
outf=$2
rm -f $outf
for i in $(seq 1 2 12); do
echo chunk for rows $i $(expr $i + 1)
awk -v chunk_start=$i -v chunk_size=2 -f et.awk $inf >> $outf
done
$ sh et.sh t.txt t-transpose.txt
chunk for rows 1 2
chunk for rows 3 4
chunk for rows 5 6
chunk for rows 7 8
chunk for rows 9 10
chunk for rows 11 12
$ cat t-transpose.txt
10 20 30 40 50 60 70 80 90 A0 B0 C0
11 21 31 41 51 61 71 81 91 A1 B1 C1
12 22 32 42 52 62 72 82 92 A2 B2 C2
13 23 33 43 53 63 73 83 93 A3 B3 C3
14 24 34 44 54 64 74 84 94 A4 B4 C4
15 25 35 45 55 65 75 85 95 A5 B5 C5
16 26 36 46 56 66 76 86 96 A6 B6 C6
17 27 37 47 57 67 77 87 97 A7 B7 C7
18 28 38 48 58 68 78 88 98 A8 B8 C8
19 29 39 49 59 69 79 89 99 A9 B9 C9
And then running the first chunk on the huge file looks like:
$ time awk -v chunk_start=1 -v chunk_size=1000 -f et.awk tenk.txt > tenk-transpose.txt
real 1m7.899s
user 1m5.173s
sys 0m2.552s
Doing that ten times with the next chunk_start set to 1001, etc. (and appending with >> to the output, of course) should finally give you the full transposed result.
There is a simple and quick algorithm based on sorting:
1) Make a pass through the input, prepending the row number and column number to each field. Output is a three-tuple of row, column, value for each cell in the matrix. Write the output to a temporary file.
2) Sort the temporary file by column, then row.
3) Make a pass through the sorted temporary file, reconstructing the transposed matrix.
The two outer passes are done by awk. The sort is done by the system sort. Here's the code:
$ echo '1 2 3
2 3 44
1 1 1' |
awk '{ for (i=1; i<=NF; i++) print i, NR, $i}' |
sort -n |
awk ' NR>1 && $2==1 { print "" }; { printf "%s ", $3 }; END { print "" }'
1 2 1
2 3 1
3 44 1

Shell Script - Filtering text files by column that will & will not exist

I want to filter a directory that is full of log files according to the 8th column according to user's input (eg: 1218738496) and output to a text file. I have a working solution, but l am looking for a better solution that offers better performance, as the total file size may exceed 1GB+.
Problem 1:
Format inconsistencies in some lines.
Problem 2:
If the line's 8th column matches the input, the lines below it (that do not contain INSERT) should be output to file as well.
Sample data
ACTION,INSTALLATION_ID,LOG_TIMESTAMP_SECONDS,LOG_TIMESTAMP_FRACTIONS,LOG_TIMESTAMP,THREAD_ID,SEQUENCE_NUMBER,LOG_LEVEL_TYPE
INSERT,SLT_TEST_1,2015/06/02 14:07:26.860 (Asia/Colombo),1127192896,0,DEBUG3
0010: 69 6c 65 40 10 92 0f 0e 67 b9 72 aa 5d e1 03 63
]",,default,false
INSERT,SLT_TEST_1,2015/06/02 14:07:13.305 (Asia/Colombo),1127192896,1,DEBUG1
INSERT,SLT_TEST_1,2015/06/02 14:07:26.860 (Asia/Colombo),1218738496,14,DEBUG3
<v s=""MONTHLY_PEAK_DWNLOAD""/>
</a><a n=""thresholdScheme""><o t=""PM_UsageMonitorConfigThreshold"">
INSERT,SLT_TEST_1,2015/06/02 14:07:26.860 (Asia/Colombo),1218738496,15,DEBUG3
0010: 69 6c 65 40 10 92 0f 0e 67 b9 72 aa 5d e1 03 63
]",,default,false
INSERT,SLT_TEST_1,2015/06/02 14:07:26.860 (Asia/Colombo),1218738496,17,DEBUG3
Desired output
INSERT,SLT_TEST_1,2015/06/02 14:07:26.860 (Asia/Colombo),1218738496,14,DEBUG3
<v s=""MONTHLY_PEAK_DWNLOAD""/>
</a><a n=""thresholdScheme""><o t=""PM_UsageMonitorConfigThreshold"">
INSERT,SLT_TEST_1,2015/06/02 14:07:26.860 (Asia/Colombo),1218738496,15,DEBUG3
0010: 69 6c 65 40 10 92 0f 0e 67 b9 72 aa 5d e1 03 63
]",,default,false
INSERT,SLT_TEST_1,2015/06/02 14:07:26.860 (Asia/Colombo),1218738496,17,DEBUG3
My current working script
for file in $(ls -rt $directory)
do
echo "Reading file : " $file
# || [[ -n "$line" ]] <-- prevent last line being ignored if doesn't end with newline
while IFS= read -r line || [[ -n "$line" ]]
do
# if line contains INSERT
if [[ $line == *"INSERT"* ]]
then
# Break it to access the thread ID
breakdown=(${line//,/ })
threadID=${breakdown[4]}
if [[ $threadID == "$inputThreadID" ]]
then
seqID=${breakdown[5]}
echo $line >> ./output_unsorted.txt
fi
else
# The "too long lines" check if they belong to the ID log we want
if [ "$threadID" == "$inputThreadID" ] && [[ $line != *"ACTION,INSTALLATION_ID"* ]]
then
if [ "$lastSeqID" != "$seqID" ]
then
echo $line >> ./output_unsorted.txt
else
echo $line >> ./output_unsorted.txt
fi
fi
fi
done < "$directory/$file"
done
Using awk
This produces the output that you ask for:
$ awk -F, '/INSERT/{f=0} $4==1218738496{f=1} f' file
INSERT,SLT_TEST_1,2015/06/02 14:07:26.860 (Asia/Colombo),1218738496,14,DEBUG3
<v s=""MONTHLY_PEAK_DWNLOAD""/>
</a><a n=""thresholdScheme""><o t=""PM_UsageMonitorConfigThreshold"">
INSERT,SLT_TEST_1,2015/06/02 14:07:26.860 (Asia/Colombo),1218738496,15,DEBUG3
0010: 69 6c 65 40 10 92 0f 0e 67 b9 72 aa 5d e1 03 63
]",,default,false
INSERT,SLT_TEST_1,2015/06/02 14:07:26.860 (Asia/Colombo),1218738496,17,DEBUG3
How it works:
-F,
Set the input field separator to a comma.
/INSERT/{f=0}
If the line contains INSERT, we set flag f to zero (false).
$4==1218738496{f=1}
If the fourth field is your selected number, then we set the flag f to one (true).
f
If f is true, print the line.
Using bash
This uses very similar logic and produces the same output but uses bash:
#!/bin/bash
f=
while IFS= read line
do
[[ $line == *"INSERT"* ]] && f=
IFS=, read a b c d rest <<<"$line"
[ "$d" = 1218738496 ] && f=1
[ "$f" ] && echo "$line"
done <file

How do you delete all lines that contain double quotes in sh?

I tried sed -ne '/\"/!p' theinput > theproductbut that got me nowhere. It didn't do anything. What can I try?
You don't need to escape quote. Write:
sed '/"/d' theinput > theproduct
or
sed -i '/"/d' theinput
to alter the file directly.
In case you have other quotes as #Jonathan Leffler suggests, you have to find out which ones. Then, using \x you can achieve what you want. \x is used to specify hexadecimal values.
sed -i '/\x22/d' theinput
The line above would delete all rows in theinput containing the ordinary (ASCII 34) quote. You'll have to try the code points Jonathan suggested.
try this:
grep -v '"' theinput > theproduct
The command you showed us should have worked.
$ cat theinput
foo"bar
foo.bar
$ sed -ne '/\"/!p' theinput > theproduct
$ cat theproduct
foo.bar
$
unless you're using csh or tcsh as your interactive shell. In that case, you'd need to escape the ! character, even within quotation marks:
% cat theinput
foo"bar
foo.bar
% sed -ne '/\"/!p' theinput > theproduct
sed -ne '/"/pwd' theinput > theproduct
sed: -e expression #1, char 5: extra characters after command
% rm theproduct
% sed -ne '/\"/\!p' theinput > theproduct
% cat theproduct
foo.bar
%
But that's inconsistent with your statement that "It didn't do anything", so it's not clear what's really going on (and the question is tagged bourne-shell anyway).
But there are much simpler ways to accomplish the same task, particularly the grep command suggested by #Mike Sokolov.
Are you sure you have 'ASCII' input? Could you have Unicode (UTF-8) with characters that are not not ASCII 34, or Unicode U+0022, but something else?
Alternative Unicode 'double quotes' could be:
U+2033 DOUBLE PRIME; U+201C LEFT DOUBLE QUOTATION MARK;
U+201D RIGHT DOUBLE QUOTATION MARK;
U+201F DOUBLE HIGH-REVERSED-9 QUOTATION MARK;
U+02DD DOUBLE ACUTE ACCENT;
(and there could easily be others I've left out).
You can look to debug this with the od command:
$ cat theinput
No double quote here
Double quote " here
Unicode pseudo-double-quotes include “”‟″˝.
$ od -c theinput
0000000 N o d o u b l e q u o t e
0000020 h e r e \n D o u b l e q u o t
0000040 e " h e r e \n U n i c o d e
0000060 p s e u d o - d o u b l e - q
0000100 u o t e s i n c l u d e “ **
0000120 ** ” ** ** ‟ ** ** ″ ** ** ˝ ** . \n
0000136
$ od -x theinput
0000000 6f4e 6420 756f 6c62 2065 7571 746f 2065
0000020 6568 6572 440a 756f 6c62 2065 7571 746f
0000040 2065 2022 6568 6572 550a 696e 6f63 6564
0000060 7020 6573 6475 2d6f 6f64 6275 656c 712d
0000100 6f75 6574 2073 6e69 6c63 6475 2065 80e2
0000120 e29c 9d80 80e2 e29f b380 9dcb 0a2e
0000136
$ odx theinput
0x0000: 4E 6F 20 64 6F 75 62 6C 65 20 71 75 6F 74 65 20 No double quote
0x0010: 68 65 72 65 0A 44 6F 75 62 6C 65 20 71 75 6F 74 here.Double quot
0x0020: 65 20 22 20 68 65 72 65 0A 55 6E 69 63 6F 64 65 e " here.Unicode
0x0030: 20 70 73 65 75 64 6F 2D 64 6F 75 62 6C 65 2D 71 pseudo-double-q
0x0040: 75 6F 74 65 73 20 69 6E 63 6C 75 64 65 20 E2 80 uotes include ..
0x0050: 9C E2 80 9D E2 80 9F E2 80 B3 CB 9D 2E 0A ..............
0x005E:
$ sed '/"/d' theinput > theproduct
$ cat theproduct
No double quote here
Unicode pseudo-double-quotes include “”‟″˝.
$
(odx is my own command for dumping data in hex.)

Change the local setting to enable sed work correctly, but why?

The following is a bash file I wrote to convert all C++ style(//) comments in a C file to C style(/**/).
#!/bin/bash
lang=`echo $LANG`
# It's necessary to change the local setting. I don't know why.
export LANG=C
# Can comment the following statement if there is not dos2unix command.
dos2unix -q $1
sed -i -e 's;^\([[:blank:]]*\)//\(.*\);\1/* \2 */;' $1
export LANG=$lang
It works. But I found a problem I cannot explain. In default, my local setting is en_US.UTF-8. And in my C code, there are comments written in Chinese, such as
// some english 一些中文注释
If I don't change the local setting, i.e., do not run the statement export LANG=C, I'll get
/* some english */一些中文注释
instead of
/* some english 一些中文注释*/
I don't know why. I just find a solution by try and error.
After read Jonathan Leffler's answer, I think I've make some mistake leading to some misunderstand. In the question, those Chinese words were inputed in Google Chrome and were not the actual words in my C file. 一些中文注释 just means some Chinese comments.
Now I inputed // some english 一些中文注释 in Visual C++ 6.0 in Windows XP, and copied the c file to Debian. Then I just run sed -i -e 's;^([[:blank:]])//(.);\1/ \2 /;' $1 and got
/* some english 一些 */中文注释
I think it's different character coding(GB18030, GBK, UTF-8?) cause the different results.
The following is my results gotten on Debian
~/sandbox$ uname -a
Linux xyt-dev 2.6.30-1-686 #1 SMP Sat Aug 15 19:11:58 UTC 2009 i686 GNU/Linux
~/sandbox$ echo $LANG
en_US.UTF-8
~/sandbox$ cat tt.c | od -c -t x1
0000000 / / s o m e e n g l i s h
2f 2f 20 73 6f 6d 65 20 65 6e 67 6c 69 73 68 20
0000020 322 273 320 251 326 320 316 304 327 242 312 315
d2 bb d0 a9 d6 d0 ce c4 d7 a2 ca cd
0000034
~/sandbox$ ./convert_comment_style_cpp2c.sh tt.c
~/sandbox$ cat tt.c | od -c -t x1
0000000 / * s o m e e n g l i s h
2f 2a 20 20 73 6f 6d 65 20 65 6e 67 6c 69 73 68
0000020 322 273 320 251 * / 326 320 316 304 327 242 312 315
20 d2 bb d0 a9 20 2a 2f d6 d0 ce c4 d7 a2 ca cd
0000040
~/sandbox$
I think these Chinese Character encoding with 2 byte(Unicode).
There are another example:
~/sandbox$ cat tt.c | od -c -t x1
0000000 / / I n W i n d o w : 250 250 ?
2f 2f 20 49 6e 57 69 6e 64 6f 77 3a 20 a8 a8 3f
0000020 1 ?
31 3f
0000022
~/sandbox$ ./convert_comment_style_cpp2c.sh tt.c
~/sandbox$ cat tt.c | od -c -t x1
0000000 / * I n W i n d o w : *
2f 2a 20 20 49 6e 57 69 6e 64 6f 77 3a 20 20 2a
0000020 / 250 250 ? 1 ?
2f a8 a8 3f 31 3f
Which platform are you working on? Your sed script works fine on MacOS X without changing locale. The Linux terminal was less happy with the Chinese characters, but it is not setup to use UTF-8. Moreover, a hex dump of the string that it did get contained a zero byte 0x00 where the Chinese started, which might lead to the confusion. (I note that your regex adds a space before the comment text if it starts // with a space.)
MacOS X (10.6.8)
The 'odx' command use is a hex-dump program.
$ echo "// some english 一些中文注释" > x3.utf8
$ odx x3.utf8
0x0000: 2F 2F 20 73 6F 6D 65 20 65 6E 67 6C 69 73 68 20 // some english
0x0010: E4 B8 80 E4 BA 9B E4 B8 AD E6 96 87 E6 B3 A8 E9 ................
0x0020: 87 8A 0A ...
0x0023:
$ utf8-unicode x3.utf8
0x2F = U+002F
0x2F = U+002F
0x20 = U+0020
0x73 = U+0073
0x6F = U+006F
0x6D = U+006D
0x65 = U+0065
0x20 = U+0020
0x65 = U+0065
0x6E = U+006E
0x67 = U+0067
0x6C = U+006C
0x69 = U+0069
0x73 = U+0073
0x68 = U+0068
0x20 = U+0020
0xE4 0xB8 0x80 = U+4E00
0xE4 0xBA 0x9B = U+4E9B
0xE4 0xB8 0xAD = U+4E2D
0xE6 0x96 0x87 = U+6587
0xE6 0xB3 0xA8 = U+6CE8
0xE9 0x87 0x8A = U+91CA
0x0A = U+000A
$ sed 's;^\([[:blank:]]*\)//\(.*\);\1/* \2 */;' x3.utf8
/* some english 一些中文注释 */
$
All of which looks clean and tidy.
Linux (RHEL 5)
I copied the x3.utf8 file to a Linux box, and dumped it. Then I ran the sed script on it, and all seemed OK:
$ odx x3.utf8
0x0000: 2F 2F 20 73 6F 6D 65 20 65 6E 67 6C 69 73 68 20 // some english
0x0010: E4 B8 80 E4 BA 9B E4 B8 AD E6 96 87 E6 B3 A8 E9 ................
0x0020: 87 8A 0A ...
0x0023:
$ sed 's;^\([[:blank:]]*\)//\(.*\);\1/* \2 */;' x3.utf8 | odx
0x0000: 2F 2A 20 20 73 6F 6D 65 20 65 6E 67 6C 69 73 68 /* some english
0x0010: 20 E4 B8 80 E4 BA 9B E4 B8 AD E6 96 87 E6 B3 A8 ...............
0x0020: E9 87 8A 20 2A 2F 0A ... */.
0x0027:
$
So far, so good. I also tried:
$ echo $LANG
en_US.UTF-8
$ echo $LC_CTYPE
$ env | grep LC_
$ bash --version
GNU bash, version 3.2.25(1)-release (x86_64-redhat-linux-gnu)
Copyright (C) 2005 Free Software Foundation, Inc.
$ cat x3.utf8
// some english 一些中文注释
$ echo $(<x3.utf8)
// some english 一些中文注释
$ sed 's;^\([[:blank:]]*\)//\(.*\);\1/* \2 */;' x3.utf8
/* some english 一些中文注释 */
$
So, the terminal is nominally working in UTF-8 after all, and it certainly seems display the data OK.
However, if I echo the string at the terminal, it gets into a tizzy. When I cut'n'pasted the string to the Linux terminal, it said:
$ echo "// some english d8d^G:
> "
// some english d8d:
$
and beeped.
$ echo "// some english d8d^G:
> " | odx
0x0000: 2F 2F 20 73 6F 6D 65 20 65 6E 67 6C 69 73 68 20 // some english
0x0010: 64 38 64 07 3A 0A 0A d8d.:..
0x0017:
$
I'm not quite sure what to make of that. I think it means that something in the input side of bash is having some problems, but I'm not quite sure. I also am getting slightly inconsistent results. The first time I tried it, I got:
$ cat > xxx
's;^\([[:blank:]]*\)//\(.*\);\1/* \2 */;'
// some english d8^#d:^[d8-f^Gf3(i^G
$ odx xxx
0x0000: 27 73 3B 5E 5C 28 5B 5B 3A 62 6C 61 6E 6B 3A 5D 's;^\([[:blank:]
0x0010: 5D 2A 5C 29 2F 2F 5C 28 2E 2A 5C 29 3B 5C 31 2F ]*\)//\(.*\);\1/
0x0020: 2A 20 5C 32 20 2A 2F 3B 27 0A 2F 2F 20 73 6F 6D * \2 */;'.// som
0x0030: 65 20 65 6E 67 6C 69 73 68 20 64 38 00 64 3A 1B e english d8.d:.
0x0040: 64 38 2D 66 07 66 33 28 69 07 0A 0A d8-f.f3(i...
0x004C:
$
And in that hex dump, you can see a 0x00 byte (offset 0x003C). That appears at the position where you got the end comment, and a null there could confuse sed; but the whole input is such a mess it is hard to know what to make of it.
Okay, here's the correct answer...
The GNU regular expression library (regex) doesn't match everything when you put a . in your expression. Yup, I know how braindead that sounds.
The problem comes from the word "character", now reasonable people will say that everything that's in the input file for sed is characters. And even in your case they are perfectly correct. But regex has been programmed to required that the input be perfectly correctly formatted characters of the current locale character set (UTF-8) if they're correctly formatted characters for the Windows character set (UTF-16) they're not "characters".
So as . only matches "characters" it doesn't match your characters.
If you used the regex //.*$, ie: pinned it to the end of the line it wouldn't match at all because there's something that's not a "character" between the // and the end of the line.
And no you can't do anything like //\(.\|[^.]\)*$, it's just impossible to match those characters without switching to the C locale.
This will also, sometimes, destroy 8-bit transparency; ie: a binary piped through sed will get corrupted even if no changes are made.
Fortunately the C locale still uses the reasonable interpretation so anything that's not a perfectly correctly formatted ASCII-68 character is still a "character".

Resources