Natural Sorting algorithm - algorithm

How do you sort an array of strings naturally in different programming languages? Post your implementation, and state which language it is written in, in your answer.

Here's how you can get explorer-like behaviour in Python:
#!/usr/bin/env python
"""
>>> items = u'a1 a003 b2 a2 a10 1 10 20 2 c100'.split()
>>> items.sort(explorer_cmp)
>>> for s in items:
... print s,
1 2 10 20 a1 a2 a003 a10 b2 c100
>>> items.sort(key=natural_key, reverse=True)
>>> for s in items:
... print s,
c100 b2 a10 a003 a2 a1 20 10 2 1
"""
import re
def natural_key(astr):
    """Return a sort key that orders embedded digit runs numerically.

    See http://www.codinghorror.com/blog/archives/001018.html

    ``re.split`` with a capturing group returns text and digit runs
    alternately, and the captured digit runs always land on the odd
    indexes.  Deciding by position instead of ``str.isdigit()`` avoids
    calling ``int()`` on characters such as u'\\xb2' (superscript two),
    for which ``isdigit()`` is true but ``int()`` raises ValueError.

    :param astr: the string to build a key for.
    :returns: list alternating text pieces (str) and numbers (int); the
        first and last text pieces may be empty strings.
    """
    pieces = re.split(r'(\d+)', astr)
    return [int(piece) if index % 2 else piece
            for index, piece in enumerate(pieces)]
def natural_cmp(a, b):
    """cmp-style comparator: order *a* and *b* by their natural keys."""
    left_key = natural_key(a)
    right_key = natural_key(b)
    return cmp(left_key, right_key)
# Default to the pure-Python comparator; switch to Explorer's own
# comparison function when it can be reached through ctypes.
explorer_cmp = natural_cmp
try:
    import ctypes
    explorer_cmp = ctypes.windll.shlwapi.StrCmpLogicalW
except (ImportError, AttributeError):
    # not on Windows or old python version
    pass

if __name__ == '__main__':
    import doctest
    doctest.testmod()
To support Unicode strings, .isdecimal() should be used instead of .isdigit().
.isdigit() may also fail (return value that is not accepted by int()) for a bytestring on Python 2 in some locales e.g., '\xb2' ('²') in cp1252 locale on Windows.

JavaScript
/**
 * Sort this array of strings in place in "natural" order, so that
 * embedded numbers compare by value ("a2" before "a10").
 *
 * Each string is first split into alternating chunks of numeric
 * characters (digits and '.') and everything else; the chunk lists are
 * sorted and then joined back into strings.
 *
 * @param {boolean} [caseInsensitive] compare text chunks ignoring case.
 */
Array.prototype.alphanumSort = function(caseInsensitive) {
  // Declared locally: the original assigned x, y, n, i, j as implicit globals.
  var z, t, x, y, n, i, j, m;
  // Iterate by index so an empty string does not end the loop early
  // (which previously left later elements unsplit and broke join()).
  for (z = 0; z < this.length; z++) {
    t = this[z];
    this[z] = [];
    x = 0;
    y = -1;
    n = 0; // a number, so the first boolean m always opens a chunk
    while (i = (j = t.charAt(x++)).charCodeAt(0)) {
      m = (i == 46 || (i >= 48 && i <= 57));
      if (m !== n) {
        this[z][++y] = "";
        n = m;
      }
      this[z][y] += j;
    }
  }
  this.sort(function(a, b) {
    for (var x = 0, aa, bb; (aa = a[x]) && (bb = b[x]); x++) {
      if (caseInsensitive) {
        aa = aa.toLowerCase();
        bb = bb.toLowerCase();
      }
      if (aa !== bb) {
        var c = Number(aa), d = Number(bb);
        if (c == aa && d == bb) {
          // Numerically equal chunks ("01" vs "1") must not end the
          // comparison; keep looking at the following chunks.
          if (c !== d) return c - d;
        } else return (aa > bb) ? 1 : -1;
      }
    }
    return a.length - b.length;
  });
  for (z = 0; z < this.length; z++)
    this[z] = this[z].join("");
};
Source

For MySQL, I personally use code from a Drupal module, which is available at http://drupalcode.org/project/natsort.git/blob/refs/heads/5.x-1.x:/natsort.install.mysql
Basically, you execute the posted SQL script to create functions, and then use ORDER BY natsort_canon(field_name, 'natural')
Here's a readme about the function:
http://drupalcode.org/project/natsort.git/blob/refs/heads/5.x-1.x:/README.txt

Here's a cleanup of the code in the article the question linked to:
def sorted_nicely(strings):
    """Return a new list of *strings* in human-friendly (natural) order."""
    ordered = list(strings)
    ordered.sort(key=natural_sort_key)
    return ordered
def natural_sort_key(key):
    """Split *key* into text pieces and integer values of its digit runs.

    The pattern's capturing group makes ``re.split`` return the digit
    runs at the odd indexes, so the position (not ``str.isdigit()``)
    decides what is converted.  ``isdigit()`` is also true for
    characters such as superscript two, which ``int()`` rejects.

    :param key: the string to build a sort key for.
    :returns: list alternating str pieces and int values.
    """
    import re
    parts = re.split(r'(\d+)', key)
    return [int(part) if position % 2 else part
            for position, part in enumerate(parts)]
But actually I haven't had occasion to sort anything this way.

If the OP is asking about idiomatic sorting expressions, then not all languages have a natural expression built in. For C I'd go to <stdlib.h> and use qsort. Something along the lines of:
/* non-functional mess deleted */
to sort the arguments into lexical order. Unfortunately this idiom is rather hard to parse for those not used the ways of c.
Suitably chastened by the downvote, I actually read the linked article. Mea culpa.
In any case, the original code did not work, except in the single case I tested. Damn. Plain vanilla C does not have this function, nor is it in any of the usual libraries.
The code below sorts the command line arguments in the natural way as linked. Caveat emptor as it is only lightly tested.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
int naturalstrcmp(const char **s1, const char **s2);
/* Sort the command line arguments naturally and print one per line. */
int main(int argc, char **argv){
    int i;
    /* Sort the command line arguments in place.  Guard against argc < 2
       (including argc == 0), where argc-1 would be zero or wrap around. */
    if (argc > 1) {
        qsort(&argv[1], (size_t)(argc-1), sizeof(char*),
              (int(*)(const void *, const void *))naturalstrcmp);
    }
    for (i = 1; i < argc; i++) {
        printf("%s\n", argv[i]);
    }
    return 0;
}
/*
 * qsort-style natural comparison of two strings passed by pointer.
 * Runs of decimal digits are compared by numeric value (leading zeros
 * are ignored); all other characters are compared as unsigned char,
 * like strcmp.  NULL entries sort after every non-NULL string and are
 * equal to each other.  Returns -1, 0 or 1.
 */
int naturalstrcmp(const char **s1p, const char **s2p){
    const char *s1;
    const char *s2;

    if ((NULL == s1p) || (NULL == *s1p)) {
        if ((NULL == s2p) || (NULL == *s2p)) return 0;
        return 1;
    }
    if ((NULL == s2p) || (NULL == *s2p)) return -1;
    s1 = *s1p;
    s2 = *s2p;

    /* Compare the characters, not the pointers: the original tested
       s1 != '\0', which is always true and ran past the terminators. */
    while (*s1 != '\0' && *s2 != '\0') {
        if (isdigit((unsigned char)*s1) && isdigit((unsigned char)*s2)) {
            size_t c1, c2;
            /* Drop leading zeros so that "007" compares as 7. */
            while (*s1 == '0' && isdigit((unsigned char)s1[1])) s1++;
            while (*s2 == '0' && isdigit((unsigned char)s2[1])) s2++;
            c1 = strspn(s1, "0123456789");
            c2 = strspn(s2, "0123456789");
            /* Without leading zeros the longer run is the larger number. */
            if (c1 != c2) return (c1 > c2) ? 1 : -1;
            /* Equal length: the first differing digit decides. */
            while (c1--) {
                if (*s1 != *s2) return (*s1 > *s2) ? 1 : -1;
                s1++;
                s2++;
            }
            /* Both pointers now sit just past the digit runs; no extra
               increment, which used to skip the following character. */
        } else {
            unsigned char a = (unsigned char)*s1;
            unsigned char b = (unsigned char)*s2;
            if (a != b) return (a > b) ? 1 : -1;
            s1++;
            s2++;
        }
    }
    /* One string is a prefix of the other: the shorter sorts first. */
    if (*s1 != '\0') return 1;
    if (*s2 != '\0') return -1;
    return 0;
}
This approach is pretty brute force, but it is simple and can probably be duplicated in any imperative language.

I just use StrCmpLogicalW. It does exactly what Jeff is wanting, since it's the same API that explorer uses. Admittedly, it's not portable.
In C++:
// Strict-weak-ordering predicate for std::sort: true when lhs orders
// before rhs according to StrCmpLogicalW (a negative result).
bool NaturalLess(const wstring &lhs, const wstring &rhs)
{
return StrCmpLogicalW(lhs.c_str(), rhs.c_str()) < 0;
}
vector<wstring> strings;
// ... load the strings
sort(strings.begin(), strings.end(), &NaturalLess);

Just a link to some nice work in Common Lisp by Eric Normand:
http://www.lispcast.com/wordpress/2007/12/human-order-sorting/

In C, this solution correctly handles numbers with leading zeroes:
#include <stdlib.h>
#include <ctype.h>
/* like strcmp but compare sequences of digits numerically */
/*
 * Like strcmp, but sequences of digits are compared numerically.
 * Returns a negative, zero or positive value.  Digit runs are parsed
 * with strtoul, so runs with different zero padding compare equal and
 * values beyond ULONG_MAX are clamped by strtoul.
 */
int strcmpbynum(const char *s1, const char *s2) {
  for (;;) {
    if (*s2 == '\0')
      return *s1 != '\0';
    else if (*s1 == '\0')
      return -1;  /* s1 is a proper prefix of s2, so it sorts first */
    else if (!(isdigit((unsigned char)*s1) && isdigit((unsigned char)*s2))) {
      if (*s1 != *s2)
        /* compare as unsigned char, as strcmp does */
        return (int)(unsigned char)*s1 - (int)(unsigned char)*s2;
      else
        (++s1, ++s2);
    } else {
      char *lim1, *lim2;
      unsigned long n1 = strtoul(s1, &lim1, 10);
      unsigned long n2 = strtoul(s2, &lim2, 10);
      if (n1 > n2)
        return 1;
      else if (n1 < n2)
        return -1;
      /* equal values: continue after both digit runs */
      s1 = lim1;
      s2 = lim2;
    }
  }
}
If you want to use it with qsort, use this auxiliary function:
/* qsort adapter: p1 and p2 point at array elements of type
   const char *, which are dereferenced and handed to strcmpbynum. */
static int compare(const void *p1, const void *p2) {
const char * const *ps1 = p1;
const char * const *ps2 = p2;
return strcmpbynum(*ps1, *ps2);
}
And you can do something on the order of
/* lines is an array of `next` string pointers, so each element is a
   char * and sizeof(lines[0]) is the pointer size compare() expects. */
char **lines = ...;
qsort(lines, next, sizeof(lines[0]), compare);

In C++ I use this example code to do natural sorting. The code requires the boost library.

Note that for most such questions, you can just consult the Rosetta Code Wiki. I adapted my answer from the entry for sorting integers.
In a systems programming language, doing something like this is generally going to be uglier than with a specialized string-handling language. Fortunately for Ada, the most recent version has a library routine for just this kind of task.
For Ada 2005 I believe you could do something along the following lines (warning, not compiled!):
type String_Array is array(Natural range <>) of Ada.Strings.Unbounded.Unbounded_String;
function "<" (L, R : Ada.Strings.Unbounded.Unbounded_String) return boolean is
begin
--// Natural ordering predicate here. Sorry to cheat in this part, but
--// I don't exactly grok the requirement for "natural" ordering. Fill in
--// your proper code here.
end "<";
-- Instantiate the standard generic array sort for String_Array; the
-- visible "<" on Unbounded_String is picked up as the ordering.
-- Generic associations are separated by commas, not semicolons.
procedure Sort is new Ada.Containers.Generic_Array_Sort
(Index_Type => Natural,
Element_Type => Ada.Strings.Unbounded.Unbounded_String,
Array_Type => String_Array
);
Example use:
-- Ada spells the clause "use"; it makes To_Unbounded_String directly visible.
use Ada.Strings.Unbounded;
Example : String_Array := (To_Unbounded_String ("Joe"),
To_Unbounded_String ("Jim"),
To_Unbounded_String ("Jane"),
To_Unbounded_String ("Fred"),
To_Unbounded_String ("Bertha"),
To_Unbounded_String ("Joesphus"),
To_Unbounded_String ("Jonesey"));
begin
Sort (Example);
...
end;

Python, using itertools:
def natural_key(s):
    """Return a tuple key that orders digit runs in *s* by numeric value.

    Consecutive characters are grouped by whether they are ASCII digits;
    digit groups become ints and all other groups stay strings.  The
    explicit ASCII test replaces ``str.isdigit``, which also accepts
    characters such as superscript two that ``int()`` cannot convert
    and which cannot be applied to unicode objects on Python 2.

    :param s: the string to build a key for.
    :returns: tuple alternating str and int items.
    """
    import itertools  # the snippet never imported it at module level

    def is_ascii_digit(ch):
        return ch in '0123456789'

    return tuple(
        int(''.join(chars)) if numeric else ''.join(chars)
        for numeric, chars in itertools.groupby(s, is_ascii_digit)
    )
Result:
>>> natural_key('abc-123foo456.xyz')
('abc-', 123, 'foo', 456, '.xyz')
Sorting:
>>> sorted(['1.1.1', '1.10.4', '1.5.0', '42.1.0', '9', 'banana'], key=natural_key)
['1.1.1', '1.5.0', '1.10.4', '9', '42.1.0', 'banana']

My implementation on Clojure 1.1:
(ns alphanumeric-sort
(:import [java.util.regex Pattern]))
(defn comp-alpha-numerical
  "Compare two strings alphanumerically.
  Both strings are split into runs of digits and runs of ASCII letters
  (other characters are ignored by the pattern).  Corresponding runs are
  compared numerically when both are digit runs, otherwise as strings;
  if one string runs out of pieces first, the one with fewer pieces
  sorts first.  Returns a negative, zero or positive number."
  [a b]
  (let [regex (Pattern/compile "[\\d]+|[a-zA-Z]+")
        digits? (fn [s] (boolean (re-matches #"\d+" s)))]
    (loop [seqa (re-seq regex a)
           seqb (re-seq regex b)]
      (if (or (empty? seqa) (empty? seqb))
        (- (count seqa) (count seqb))
        (let [c (first seqa)
              d (first seqb)
              ;; Parse digit runs as decimal BigIntegers: read-string
              ;; treats a leading 0 as octal (\"010\" -> 8, \"08\" throws).
              ;; A local binding replaces the original (def result ...),
              ;; which redefined a namespace-level var on every call.
              result (if (and (digits? c) (digits? d))
                       (compare (BigInteger. c) (BigInteger. d))
                       (compare c d))]
          (if (zero? result)
            (recur (rest seqa) (rest seqb))
            result))))))
(sort comp-alpha-numerical ["a1" "a003" "b2" "a10" "a2" "1" "10" "20" "2" "c100"])
Result:
("1" "2" "10" "20" "a1" "a2" "a003" "a10" "b2" "c100")

For Tcl, the -dict (dictionary) option to lsort:
% lsort -dict {a b 1 c 2 d 13}
1 2 13 a b c d

PHP has a built-in function, "natsort", that does this; I also implemented it myself for comparison:
<?php
$temp_files = array('+====','-==',"temp15-txt","temp10.txt",
"temp1.txt","tempe22.txt","temp2.txt");
$my_arr = $temp_files;
natsort($temp_files);
echo "Natural order: ";
print_r($temp_files);
echo "My Natural order: ";
usort($my_arr,'my_nat_func');
print_r($my_arr);
// Despite its name, this returns true when $a lies between '0' and '9',
// i.e. it tests for a digit, not for a letter.
function is_alpha($a){
return $a>='0'&&$a<='9' ;
}
// usort callback returning 1, 0 or -1.
// - Both strings contain a digit: only the integers starting at the first
//   digit of each string are compared; the text before them is ignored.
// - Only one contains a digit: the string without digits sorts first.
// - Neither contains a digit: plain > / == comparison of the strings.
function my_nat_func($a,$b){
if(preg_match('/[0-9]/',$a)){
if(preg_match('/[0-9]/',$b)){
$i=0;
// advance to the first digit of $a (preg_match above found one)
while(!is_alpha($a[$i])) ++$i;
$m = intval(substr($a,$i));
$i=0;
// advance to the first digit of $b
while(!is_alpha($b[$i])) ++$i;
$n = intval(substr($b,$i));
return $m>$n?1:($m==$n?0:-1);
}
return 1;
}else{
if(preg_match('/[0-9]/',$b)){
return -1;
}
return $a>$b?1:($a==$b?0:-1);
}
}

Java solution:-
This can be achieved by implementing new Comparator<String> and pass it to Collections.sort(list, comparator) method.
#Override
public int compare(String s1, String s2) {
int len1 = s1.length();
int len2 = s2.length();
int lim = Math.min(len1, len2);
char v1[] = s1.toCharArray();
char v2[] = s2.toCharArray();
int k = 0;
while (k < lim) {
char c1 = v1[k];
char c2 = v2[k];
if (c1 != c2) {
if(this.isInteger(c1) && this.isInteger(c2)) {
int i1 = grabContinousInteger(v1, k);
int i2 = grabContinousInteger(v2, k);
return i1 - i2;
}
return c1 - c2;
}
k++;
}
return len1 - len2;
}
/** Returns true when c is one of the ASCII digits '0' through '9'. */
private boolean isInteger(char c) {
    return '0' <= c && c <= '9';
}
/**
 * Parses the run of consecutive digits in arr that starts at index k.
 * The run ends at the first non-digit or at the end of the array; the
 * digits are converted with Integer.parseInt, so a run that does not
 * fit in an int (or an empty run) throws NumberFormatException.
 */
private int grabContinousInteger(char[] arr, int k) {
int i = k;
while(i < arr.length && this.isInteger(arr[i])) {
i++;
}
return Integer.parseInt(new String(arr, k, i - k));
}

Related

Any faster way to replace substring in AWK

I have a long string of about 50,000,000 long... , and I am substituting it part by part
cat FILE | tail -n+2 | awk -v k=100 '{
i = 1
while (i<length($0)-k+1) {
x = substr($0, i, k)
if (CONDITION) {
x changed sth
$0 = substr($0,1,i-1) x substr($0,i+k)
}
i += 1
}
gsub(sth,sth,$0)
printf("%s",$0) >> FILE
}'
Are there any ways to replace $0 at position i with x of length k without using this method?
The string is too long and the commands runs extremely slow
sample input:
NNNNNNNNNNggcaaacagaatccagcagcacatcaaaaagcttatccacAGTAATTCATTATATCAAAATGCTCCAggccaggcgtggtggcttatgcc
sample output:
NNNNNNNNNNggcnnncngnnnccngcngcncnncnnnnngcnnnnccncNGNNNNNCNNNNNNNCNNNNNGCNCCNggccnggcgnggnggcnnnngcc
If substring with length k=10 contains >50% of A || a || T || t
(so there are length($0)-k+1 substrings)
substitute A and T with N, a and t with n
The $0 string must maintain it size and sequence (Case sensitive)
EDIT:
I misunderstood the requirements of this problem, and have reposted the question here.
Basically:
read a window of characters to two buffers - scratch buffer and output buffer
if the scratch buffer contains more than a given count of the characters ATat,
then replace all characters ATat in the output buffer with N or n respectively
output one character from the output buffer
flush one character in both buffers
and go to step 1 to repeat reading the characters into buffers
when the end of line is encountered, just flush output buffer and reset it all
A small C program for sure is going to be the fastest:
// The window size
#define N 10
// The percent of the window that has to be equal to one of [AaTt]
#define PERCENT 50
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
// Write the first n bytes of outme to stdout as a single fwrite item.
static void output(char *outme, size_t n) {
fwrite(outme, n, 1, stdout);
}
// True when c is one of the searched characters: A, a, T or t.
static bool is_one_of_them(char c) {
    return c == 'A' || c == 'a' || c == 'T' || c == 't';
}
// Convert a letter to n/N depending on its case.
// ASCII is assumed: the bits above the low five (which carry the case)
// are kept from c, and the low five bits are taken from 'n'.
static char convert_them_to_n(char c) {
    const char high_bits = ~0x1f;
    const char n_low_bits = 'n' & ~high_bits;
    return (c & high_bits) | n_low_bits;
}
static const unsigned threshold = N * PERCENT / 100;
// Store the input in buf
static char buf[N];
// Store the output to-be-outputted in out
static char out[N];
// The current position in buf and out
// The count of readed characters
static size_t pos;
// The count of one of searched characters in buf
static unsigned count_them;
// Empty both buffers: no characters held, no searched characters counted.
static void buf_reset(void) {
pos = 0;
count_them = 0;
}
// Write out everything still held in the output buffer, then reset.
static void buf_flush(void) {
output(out, pos);
buf_reset();
}
// Replace every A/a/T/t in the N-character output buffer with N/n.
// The loop covers all N slots; buf_add calls this only when pos == N.
static void buf_replace_them(void) {
// TODO: this could keep count of characters already replaced in out to save CPU
for (size_t i = 0; i < N; ++i) {
if (is_one_of_them(out[i])) {
out[i] = convert_them_to_n(out[i]);
}
}
}
// Slide the window by one character: emit out[0], remove the oldest
// character from both buffers and from the running count.
static void buf_flush_one(void) {
assert(pos > 0);
assert(pos == N);
output(out, 1);
// the count is based on the untouched input copy, not on out
count_them -= is_one_of_them(buf[0]);
memmove(buf, buf + 1, pos - 1);
memmove(out, out + 1, pos - 1);
pos--;
}
// Append c to both buffers and update the count of searched characters.
// Once the window holds N characters it is converted if the count has
// reached the threshold, and then one character is emitted.
static void buf_add(char c) {
buf[pos] = out[pos] = c;
pos++;
count_them += is_one_of_them(c);
// if we reached the substring length
if (pos == N) {
// if the count reached the threshold
if (count_them >= threshold) {
// convert the characters to n
buf_replace_them();
}
// flush one character only at a time
buf_flush_one();
}
}
// Filter stdin to stdout one character at a time; each input line is
// processed independently of the others.
int main() {
int c;
buf_reset();
while ((c = getchar()) != EOF) {
if (c == '\n') {
// If it is a newline, just flush what we have buffered
buf_flush();
output("\n", 1);
continue;
}
buf_add(c);
}
// flush whatever is left of a final line that has no newline
buf_flush();
}
Such a C program is easily translated to, for example, an awk script; one just needs to read one character at a time. Below I split the line into characters with split, like:
awk -v N=10 -v percent=50 '
# Sliding window filter: whenever a window of N characters holds at
# least threshold characters from [AaTt], every such character inside
# the window is replaced by N or n.  buf holds the original characters
# (used for counting), out holds the characters to be printed.
BEGIN{ threshold = N * percent / 100; pos=0; count_them=0 }
# True when c is exactly one of a, A, t, T.
function is_one_of_them(c) {
    return c ~ /^[aAtT]$/;
}
# Print everything still buffered and reset the window.
function buf_flush(i) {
    for (i = 0; i < pos; ++i) {
        printf "%s", out[i]
    }
    pos = 0
    count_them = 0
}
# Replace the searched characters in the output window.
function buf_replace_them(i) {
    for (i = 0; i < pos; ++i) {
        if (is_one_of_them(out[i])) {
            out[i] = out[i] ~ /[AT]/ ? "N" : "n";
        }
    }
}
# Emit the oldest character and shift both buffers left by one.
function buf_flush_one(i) {
    printf "%s", out[0]
    count_them -= is_one_of_them(buf[0])
    for (i = 0; i < pos-1; ++i) {
        buf[i] = buf[i+1]
        out[i] = out[i+1]
    }
    pos--
}
# Append one character; once the window is full, convert it if needed
# and emit one character.
function buf_add(c) {
    buf[pos]=c; out[pos]=c; pos++
    count_them += is_one_of_them(c)
    if (pos == N) {
        if (count_them >= threshold) {
            buf_replace_them()
        }
        buf_flush_one()
    }
}
{
    # split() fills chars[1..len]; starting at index 0 would feed an
    # empty element into the first window and shorten it by one.
    len = split($0, chars, "")
    for (idx = 1; idx <= len; idx++) {
        buf_add(chars[idx])
    }
    buf_flush();
    printf "\n";
}
'
Both programs, when run with the input presented in the first line, produce the output presented in the second line (note that the lone a near the end is not replaced, because no window of 10 characters containing it holds 5 of the characters ATat):
NNNNNNNNNNggcaaacagaatccagcagcacatcaaaaagcttatccacAGTAATTCATTATATCAAAATGCTCCAggccaggcgtggtggcttatgcc
NNNNNNNNNNggcnnncngnnnccngcngcncnncnnnnngcnnnnccncNGNNNNNCNNNNNNNCNNNNNGCNCCNggccaggcgnggnggcnnnngcc
Both solutions were tested on repl.
You need to be careful with how you address this problem. You cannot work on the substituted string. You need to keep track of the original string. Here is a simple example. Assume we have a string consisting of x and y and we want to replace all y with z if there are 8 y in a substring of 10. Imagine your input looks like:
yyyyyyyyxxy
The first substring of 10 reads yyyyyyyyxx and would be translated into zzzzzzzzxx. If you perform the substitution directly into the original string, you get zzzzzzzzxxy. The second substring now reads zzzzzzzxxy, and does not contain 8 times y, while in the original string it does. So according to the solution of the OP, this would lead into inconsistent results, depending on if you start from the front or the back. So a quick solution would be:
awk -v N=10 -v p=50 '
# n is the number of A/T/a/t characters a window of N needs to qualify.
BEGIN { n = N*p/100 }
# s collects the substitutions; $0 stays untouched so that every window
# is judged on the original characters.
{ s = $0 }
# There are length-N+1 windows, so the last start position is
# length-N+1 (the bound length-N skipped the final window).
{ for(i=1;i<=length-N+1;++i) {
str=substr($0,i,N)
c=gsub(/[AT]/,"N",str) + gsub(/[at]/,"n",str)
if(c >= n) s = substr(s,1,i-1) str substr(s,i+N)
}
}
{ print s }' file
There is of course quite a lot of duplicated work here. Imagine you have a string of the form xxyyyyyyyyxx: you would perform 4 concatenations while you only need to do one. So the best idea is to minimise the work and only check the substrings which end with one of the characters of interest:
awk -v N=10 -v p=50 '
BEGIN { n = N*p/100 }
# Judge the window of the original line that ends at position e and, if
# it qualifies, splice its substituted form into s.
function check_window(e,    str, c) {
    str = substr($0, e-N+1, N)
    c = gsub(/[AT]/,"N",str) + gsub(/[at]/,"n",str)
    if (c >= n) s = substr(s,1,e-N) str substr(s,e+1)
}
{ s = $0
  if (length($0) >= N) {
      # The first window must always be checked: its last A/T/a/t may
      # lie before position N, where the search below never looks.
      check_window(N)
      # After that only windows ending on one of A/T/a/t can add
      # anything new, so jump from one such character to the next.
      i = N+1
      while (match(substr($0,i),/[ATat]/)) {
          e = i+RSTART-1
          check_window(e)
          i = e+1
      }
  }
  print s }' file
To replace $0 at position i with x do:
awk 'BEGIN{i=12345;x="blubber"}
# Overwrite length(x) characters of the line, starting at position i.
{
# everything before position i (1-based), hence i-1 characters
printf("%s",substr($0,1,i-1));
printf("%s",x);
# everything after the overwritten stretch
printf("%s",substr($0,i+length(x)));
}'
I don't think there is any faster method.
To replace AGCT with N and agct with n use tr. To replace them only within a range and using awk you should do:
awk 'BEGIN{i=12345;n=123}
# Replace ATGC with N and atgc with n in the n characters from position i.
{
printf("%s",substr($0,1,i-1));
# gsub edits its target in place and returns a count, so it needs a
# variable to work on; the third argument of substr is a length.
mid = substr($0,i,n);
gsub(/[ATGC]/,"N",mid);
gsub(/[atgc]/,"n",mid);
printf("%s",mid);
printf("%s",substr($0,i+n));
}'
To do more advanced and faster processing you should consider c/c++.

Issue in making a String Algorithm

Given a string made up of 'a' and 'b' only,the operation that is allowed is to remove a substring of "abb" if present from the string. My question is after applying this operation any no of times can i make the string empty. I need a O(n) algorithm.
Example ,
abbabb-->yes
aabbbb->yes since aabbbb->abb->empty
aaabbb->no since aaabbb->aab
All that I can think of so far is an O(n^2) algorithm in which I successively find the position of the substring using substr() or find() and then remove it, until the string is empty or no "abb" is found in it.
Here is an example of what I suggested in the comment:
for i = 0 to word.length-1
if word[i] == 'b'
if stack.empty() //no corresponding a
return false
if stack.top() == 'a' //first b after an a
stack.push('b')
else //second b after an a
stack.pop() //pop last two letters
stack.pop()
else
stack.push('a')
return stack.empty()
There might be some boundary conditions that needs to be checked, and of course at any point pop() fails you need to return false. Seems to be working for the possible inputs that occurs to me.
The point that needs to be mathematically proved, I think, is the part where I commented "second b after an a". With the assumption that stack was empty at the beginning, if I did not miss anything that point looks correct.
It is not necessary to store anything but the count of unused pairs of b's at the end of the string, as you read it Right to Left. (And it's solved reading input only once, so O(n) time O(1) space) This is very reminiscent of finding a discrete finite automata for a regular language. If you see two b's, increase count. If you see a single b, add half a pair (update a boolean variable and possibly increment count). If you see an a and have no pair of b's, fail, else count--. If you reach the end of the string and there were no extra b's, the string was valid.
Make use of two counters to avoid using a stack. Here is a C++ implementation; I hope it works.
// Decide whether s can be emptied by repeatedly deleting "abb".
// pendingA counts the a's still waiting for their two b's; halfPair is
// 1 while a single b of the next pair has been seen.  Every character
// other than 'a' is counted like a b, except that a 'b' arriving while
// no a is waiting rejects the string at once.
bool canBeDone(string s)
{
    int pendingA = 0;
    int halfPair = 0;
    for (string::size_type pos = 0; pos < s.length(); ++pos)
    {
        const char ch = s[pos];
        if (ch == 'a')
        {
            ++pendingA;
        }
        else if (ch == 'b' && pendingA == 0)
        {
            return false;
        }
        else if (++halfPair == 2)
        {
            // a full pair of b's completes one "abb"
            halfPair = 0;
            --pendingA;
        }
    }
    return pendingA == 0 && halfPair == 0;
}
Very simple and straightforward implementation in Erlang O(n) space and time (unfortunately even clwhisk's algorithm needs O(n) space in Erlang because of lists:reverse/1):
%% Two checks for "can this string be emptied by repeatedly removing abb".
-module(abb).
-export([check/1, clwhisk/1, test/0]).
%% check/1: stack-based check; starts check/2 with an empty stack.
check(L) when is_list(L) ->
check(L, []).
%% check/2: S is a stack of the characters read so far, newest first.
%% When its top three elements are b, b, a (an "abb" in input order)
%% they are dropped; otherwise the next input character is pushed.
%% The input is accepted when the stack is empty at the end.
check(L, "bba" ++ T) -> check(L, T);
check([H|T], S) -> check(T, [H|S]);
check([], S) -> S =:= [].
%% clwhisk/1: counter-based check over the reversed string.
clwhisk(L) when is_list(L) ->
clwhisk(lists:reverse(L), 0).
%% clwhisk/2: C is the number of b's not yet claimed by an a.  Each a
%% needs at least two of them and consumes two.  The last clause accepts
%% only an exhausted list with no b left over, so any other character
%% is rejected.
clwhisk([$b|T], C) -> clwhisk(T, C+1);
clwhisk([$a|T], C) -> C >= 2 andalso clwhisk(T, C-2);
clwhisk(L, C) -> L =:= [] andalso C =:= 0.
%% test/0: runs both checks on the same four samples; returns ok or
%% fails with badmatch.
test() ->
true = abb:check("abbabb"),
true = abb:check("aabbbb"),
false = abb:check("aaabbb"),
true = abb:check("ababbb"),
true = abb:clwhisk("abbabb"),
true = abb:clwhisk("aabbbb"),
false = abb:clwhisk("aaabbb"),
true = abb:clwhisk("ababbb"),
ok.
And there is C implementation of clwhisk's algorithm as filter:
#include <stdlib.h>
#include <stdio.h>
/* Return a pointer to the last character before the first newline or
   terminator of s.  For a string that starts with either of them the
   result is s-1, so callers must not dereference it in that case. */
static inline const char *last(const char* s){
for(;*s && *s!='\n';s++);
return s-1;
}
/* Return 1 when the first line of s (up to a newline or the end of the
   string) can be emptied by removing "abb" repeatedly, else 0.
   The line is scanned from right to left: every b adds one to the
   balance, every a needs two b's to its right and takes them away.
   Any character other than a or b rejects the line. */
static int check(const char* s){
    const char *end = s;
    int balance = 0;

    /* find the end of the line */
    while (*end != '\0' && *end != '\n')
        end++;

    while (end > s) {
        const char ch = *--end;
        if (ch == 'b') {
            balance++;
            continue;
        }
        if (ch != 'a')
            return 0;
        balance -= 2;
        if (balance < 0)
            return 0;
    }
    return balance == 0;
}
/* Filter: copy to stdout only the non-empty input lines that check()
   accepts. */
int main(void) {
    char *line = NULL;
    size_t len = 0;
    while( getline(&line, &len, stdin) != -1 )
        if(*line && *line != '\n' && check(line))
            fputs(line, stdout);
    /* getline allocates the buffer; release it once reading is done */
    free(line);
    return EXIT_SUCCESS;
}

Filter only digit sequences containing a given set of digits

I have a large list of digit strings like this one. The individual strings are relatively short (say less than 50 digits).
data = [
'300303334',
'53210234',
'123456789',
'5374576807063874'
]
I need to find an efficient data structure (speed first, memory second) and algorithm which returns only those strings that are composed of a given set of digits.
Example results:
filter(data, [0,3,4]) = ['300303334']
filter(data, [0,1,2,3,4,5]) = ['300303334', '53210234']
The data list will usually fit into memory.
For each digit, precompute a postings list of the strings that don't contain that digit.
# postings[j] lists, in ascending order, the indexes of the strings in
# data that do NOT contain the digit j.
postings = [
    [index for index, text in enumerate(data) if str(j) not in text]
    for j in xrange(10)
]
Now, to find all strings that contain, for example, just the digits [1, 3, 5], you can intersect the postings lists for the other digits (i.e. 0, 2, 4, 6, 7, 8, 9).
def intersect_postings(p0, p1):
    """Yield the values common to two ascending iterators.

    Both inputs must be iterators (not lists) that produce their values
    in increasing order.  The merge stops as soon as either one is
    exhausted.  The StopIteration raised by ``next()`` is caught here
    because, since PEP 479, letting it escape from a generator turns it
    into a RuntimeError.

    :param p0: iterator over ascending postings.
    :param p1: iterator over ascending postings.
    """
    try:
        i0, i1 = next(p0), next(p1)
        while True:
            if i0 == i1:
                yield i0
                i0, i1 = next(p0), next(p1)
            elif i0 < i1:
                i0 = next(p0)
            else:
                i1 = next(p1)
    except StopIteration:
        return
def find_all(digits):
    """Return an iterator over the strings made up only of *digits*.

    The postings lists of every digit that is not allowed are
    intersected; what survives are the indexes of strings containing
    none of those digits.  When all ten digits are allowed there is
    nothing to intersect and every string qualifies.
    """
    excluded = [d for d in xrange(10) if d not in digits]
    if not excluded:
        return iter(data)
    survivors = iter(postings[excluded[0]])
    for d in excluded[1:]:
        survivors = intersect_postings(survivors, iter(postings[d]))
    return (data[i] for i in survivors)
print list(find_all([0, 3, 4]))
print list(find_all([0, 1, 2, 3, 4, 5]))
A string can be encoded by a 10-bit number. There are 2^10, or 1,024 possible values.
So create a dictionary that uses an integer for a key and a list of strings for the value.
Calculate the value for each string and add that string to the list of strings for that value.
General idea:
Dictionary Lookup;
for each (string in list)
value = 0;
for each character in string
set bit N in value, where N is the character (0-9)
Lookup[value] += string // adds string to list for this value in dictionary
Then, to get a list of the strings that match your criteria, just compute the value and do a direct dictionary lookup.
So if the user asks for strings that contain only 3, 5, and 7:
// Combine the bits with bitwise OR; logical || would collapse the
// expression to 1.
value = (1 << 3) | (1 << 5) | (1 << 7);
list = Lookup[value];
Note that, as Matt pointed out in comment below, this will only return strings that contain all three digits. So, for example, it wouldn't return 37. That seems like a fatal flaw to me.
Edit
If the number of symbols you have to deal with is very large, then the number of possible combinations becomes too large for this solution to be practical.
With a large number of symbols, I'd recommend an inverted index as suggested in the comments, combined with a secondary filter that removes the strings that contain extraneous digits.
Consider a function f which constructs a bitmask for each string with bit i set if digit i is in the string.
For example,
f('0') = 0b0000000001
f('00') = 0b0000000001
f('1') = 0b0000000010
f('1100') = 0b0000000011
Then I suggest storing a list of strings for each bitmask.
For example,
Bitmask 0b0000000001 -> ['0','00']
Once you have prepared this data structure (which is the same size as your original list), you can then easily access all the strings for a particular filter by accessing all lists where the bitmask is a subset of the digits in your filter.
So for your example of filter [0,3,4] you would return the lists from:
Strings containing just 0
Strings containing just 3
Strings containing just 4
Strings containing 0 and 3
Strings containing 0 and 4
Strings containing 3 and 4
Strings containing 0 and 3 and 4
Example Python Code
from collections import defaultdict
import itertools
raw_data = [
'300303334',
'53210234',
'123456789',
'5374576807063874'
]
def preprocess(raw_data):
    """Group the digit strings by the set of digits they contain.

    Returns a defaultdict mapping a bitmask (bit i set when digit i
    occurs in the string) to the list of strings having exactly that
    digit set, in input order.
    """
    by_mask = defaultdict(list)
    for text in raw_data:
        bits = 0
        for ch in text:
            bits = bits | (1 << int(ch))
        by_mask[bits].append(text)
    return by_mask
def filter(data, mask):
    """Yield every string in *data* that uses only digits from *mask*.

    Every non-empty subset of the allowed digits is turned into a
    bitmask and looked up in *data*.

    :param data: mapping from digit-set bitmask to list of strings, as
        built by ``preprocess``.
    :param mask: iterable of allowed digits (ints 0-9); repeated digits
        are ignored.
    """
    # Drop repeated digits (keeping first-seen order): a duplicate would
    # make the summed bitmask wrong and yield the same subset twice.
    digits = []
    for digit in mask:
        if digit not in digits:
            digits.append(digit)
    for r in range(len(digits)):
        for m in itertools.combinations(digits, r + 1):
            bitmask = sum(1 << digit for digit in m)
            # .get() instead of indexing: indexing a defaultdict would
            # insert an empty list for every subset that is probed.
            for s in data.get(bitmask, ()):
                yield s
data = preprocess(raw_data)
for a in filter(data, [0,1,2,3,4,5]):
print a
Just for kicks, I have coded up Jim's lovely algorithm and the Perl is here if anyone wants to play with it. Please do not accept this as an answer or anything, pass all credit to Jim:
#!/usr/bin/perl
# Demo of the bitmask dictionary: every word is encoded as a 10-bit
# value (bit d set when digit d occurs in the word) and stored under
# that value; a query is answered by a single hash lookup, so it returns
# the words that contain exactly the requested set of digits.
use strict;
use warnings;
my $Debug=1;
my $Nwords=1000;
my ($word,$N,$value,$i,$j,$k);
my (@dictionary,%Lookup);
################################################################################
# Generate "words" with random number of characters 5-30
################################################################################
print "DEBUG: Generating $Nwords word dictionary\n" if $Debug;
for($i=0;$i<$Nwords;$i++){
   $j = rand(25) + 5; # length of this word
   $word="";
   for($k=0;$k<$j;$k++){
      $word = $word . int(rand(10));
   }
   $dictionary[$i]=$word;
   print "$word\n" if $Debug;
}
# Add some obvious test cases.  push appends directly after the random
# words; indexing with ++$i skipped one slot and left an undef entry.
push(@dictionary, map { $_ x 50 } 0..9);
push(@dictionary, "0123456789");
################################################################################
# Encode words
################################################################################
for $word (@dictionary){
   $value=0;
   for($i=0;$i<length($word);$i++){
      $N=substr($word,$i,1);
      $value |= 1 << $N;
   }
   push(@{$Lookup{$value}},$word);
   print "DEBUG: $word encoded as $value\n" if $Debug;
}
################################################################################
# Do lookups
################################################################################
while(1){
   print "Enter permitted digits, separated with commas: ";
   my $line=<STDIN>;
   last unless defined $line;   # stop at end of input instead of looping forever
   chomp $line;
   # Take every digit on the line; commas, blanks and the newline are ignored.
   my @digits = $line =~ /([0-9])/g;
   $value=0;
   for my $d (@digits){
      $value |= 1<<$d;
   }
   print "Value: $value\n";
   print join(", ",@{$Lookup{$value}}),"\n\n" if defined $Lookup{$value};
}
I like Jim Mischel's approach. It has pretty efficient look up and bounded memory usage. Code in C follows:
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <readline/readline.h>
#include <readline/history.h>
enum {
zero = '0',
nine = '9',
numbers = nine - zero + 1,
masks = 1 << numbers,
};
typedef uint16_t mask;
struct list {
char *s;
struct list *next;
};
typedef struct list list_cell;
typedef struct list *list;
static inline int is_digit(char c) { return c >= zero && c <= nine; }
static inline mask char2mask(char c) { return 1 << (c - zero); }
static inline mask add_char2mask(mask m, char c) {
return m | (is_digit(c) ? char2mask(c) : 0);
}
static inline int is_set(mask m, mask n) { return (m & n) != 0; }
static inline int is_set_char(mask m, char c) { return is_set(m, char2mask(c)); }
static inline int is_submask(mask sub, mask m) { return (sub & m) == sub; }
/* Write the digits whose bits are set in m into buf, in ascending
   order, followed by a terminating 0; returns buf.  At most ten digits
   plus the terminator are written, hence the size of 11. */
static inline char *sprint_mask(char buf[11], mask m) {
char *s = buf;
char i;
for(i = zero; i <= nine; i++)
if(is_set_char(m, i)) *s++ = i;
*s = 0;
return buf;
}
/* Build the digit-set mask of s by folding every character of the
   string through add_char2mask. */
static inline mask get_mask(char *s) {
mask m=0;
for(; *s; s++)
m = add_char2mask(m, *s);
return m;
}
static inline int is_empty(list l) { return !l; }
/* Push s onto the front of the list *l and return the new head.
   The string is not copied; the cell only stores the pointer.
   Terminates the program if the cell cannot be allocated. */
static inline list insert(list *l, char *s) {
    list cell = (list)malloc(sizeof(list_cell));
    if(!cell) {
        /* dereferencing a failed allocation would crash below */
        perror("malloc");
        exit(EXIT_FAILURE);
    }
    cell->s = s;
    cell->next = *l;
    return *l = cell;
}
/* Left fold over the list: call f(string, accumulator) for every cell
   from head to tail, feeding each result into the next call, and
   return the final accumulator. */
static void *foreach(void *f(char *, void *), list l, void *init) {
for(; !is_empty(l); l = l->next)
init = f(l->s, init);
return init;
}
struct printer_state {
int first;
FILE *f;
};
/* foreach callback that prints one list member to st->f.
   Note the sense of the flag: first is 0 until one member has been
   printed and 1 afterwards, so a non-zero value means "write the
   separator before this member". */
static void *prin_list_member(char *s, void *data) {
struct printer_state *st = (struct printer_state *)data;
if(st->first) {
fputs(", ", st->f);
} else
st->first = 1;
fputs(s, st->f);
return data;
}
static void print_list(list l) {
struct printer_state st = {.first = 0, .f = stdout};
foreach(prin_list_member, l, (void *)&st);
putchar('\n');
}
/* Allocate the lookup table: one empty (NULL) list per possible digit
   mask.  calloc takes the element count first and the element size
   second.  Terminates the program if the table cannot be allocated. */
static list *init_lu(void) {
    list *lu = (list *)calloc(masks, sizeof(list));
    if(!lu) {
        perror("calloc");
        exit(EXIT_FAILURE);
    }
    return lu;
}
/* Register s under every mask that is a superset of its own digit set,
   so that lu[q] ends up holding all strings built only from the digits
   in q and a query needs a single table access.  Returns lu. */
static list *insert2lu(list lu[masks], char *s) {
mask i, m = get_mask(s);
if(m) // skip string without any number
// supersets of m are never numerically smaller than m, so start there
for(i = m; i < masks; i++)
if(is_submask(m, i))
insert(lu+i, s);
return lu;
}
int usage(const char *name) {
fprintf(stderr, "Usage: %s filename\n", name);
return EXIT_FAILURE;
}
#define handle_error(msg) \
do { perror(msg); exit(EXIT_FAILURE); } while (0)
static inline void chomp(char *s) { if( (s = strchr(s, '\n')) ) *s = '\0'; }
/* Read f line by line and build the lookup table from its lines.
   Every line that contains a digit is handed over to the table, which
   keeps the pointer, so getline gets a fresh buffer for the next line.
   Returns the table. */
list *load_file(FILE *f) {
    char *line = NULL;
    size_t len = 0;
    list *lu = init_lu();
    while(getline(&line, &len, f) != -1) {
        chomp(line);
        if(get_mask(line))
            insert2lu(lu, line);
        else
            free(line);  /* insert2lu would ignore it, so nobody else owns it */
        line = NULL;     /* make getline allocate a new buffer */
        len = 0;
    }
    /* getline may have allocated a buffer even for the failed final call */
    free(line);
    return lu;
}
/* Interactive query loop: read lines with readline until it returns
   NULL.  Every digit found on a non-empty line is taken as an allowed
   digit (other characters are ignored by get_mask); the resulting mask
   and the strings stored under it are printed. */
void read_reqs(list *lu) {
char *line;
char buf[11];
for(; (line = readline("> ")); free(line))
if(*line) {
add_history(line);
mask m = get_mask(line);
printf("mask: %s\nstrings: ", sprint_mask(buf, m));
print_list(lu[m]);
};
putchar('\n');
}
/* Usage: digitfilter filename.  Loads the digit strings from the file,
   then answers queries interactively.  The lookup table is not freed
   before the program exits. */
int main(int argc, const char* argv[] ) {
const char *name = argv[0];
FILE *f;
list *lu;
if(argc != 2) return usage(name);
f = fopen(argv[1], "r");
if(!f) handle_error("open");
lu = load_file(f);
fclose(f);
read_reqs(lu);
return EXIT_SUCCESS;
}
To compile use
gcc -o digitfilter digitfilter.c -lreadline
And test run:
$ cat data.txt
300303334
53210234
123456789
5374576807063874
$ ./digitfilter data.txt
> 034
mask: 034
strings: 300303334
> 0,1,2,3,4,5
mask: 012345
strings: 53210234, 300303334
> 0345678
mask: 0345678
strings: 5374576807063874, 300303334
Put each value into a set-- Eg.: '300303334'={3, 0, 4}.
Since the length of your data items are bound by a constant (50),
you can do these at O(1) time for each item using Java HashSet. The overall complexity of this phase adds up to O(n).
For each filter set, use containsAll() of HashSet to see whether
each of these data items is a subset of your filter. Takes O(n).
Takes O(m*n) in the overall where n is the number of data items and m the number of filters.

custom ITOA not working right?

I wanted to make a custom ITOA function to put large numbers into small strings, this is what I have coded :
main(){
printf("itoa(2000000000,36)= '%s'",itoa(2000000000,36));
printf("itoa(36,36)= '%s'",itoa(36,36));
printf("itoa(37,36)= '%s'",itoa(37,36));
return 1;
}
stock itoa(val, base)
{
new buf[1024] = {0,...};
new i = 1023;
new LETTERZ[37] = {'0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',0};
for(; val && i; --i, val /= base)
buf[i] = LETTERZ[val % base];
return buf[i+1];
}
It's based on 'C' code from this page: http://www.jb.man.ac.uk/~slowe/cpp/itoa.html
But somehow this is the output:
[20:34:35] itoa(2000000000,36)= 'X'
[20:34:35] itoa(36,36)= '1'
[20:34:35] itoa(37,36)= '1'
And this is totally wrong. I don't know exactly which output to expect, but 36 and 37 surely can't give the same output, and 2 000 000 000 can't be just 'X', as X is supposed to be 35, not 2 000 000 000.
ZZ should be 1295, I think... I want to base this on the hexadecimal system, but with all the letters of the alphabet.
Could anyone tell me what's wrong here?
I'm working with a typeless language called PAWN (also known as SMALL) and later i want to use this code in VB.NET
/* itoa example */
#include <stdio.h>
#include <stdlib.h>
int main ()
{
int i;
char buffer [33];
printf ("Enter a number: ");
scanf ("%d",&i);
itoa (i,buffer,10);
printf ("decimal: %s\n",buffer);
itoa (i,buffer,16);
printf ("hexadecimal: %s\n",buffer);
itoa (i,buffer,2);
printf ("binary: %s\n",buffer);
return 0;
}
You only give the number and the base, but parameter 2 needs a pointer to char already allocated. Use a buffer or try NULL, so the function will return the result.
The solution turned out to be simple: return buf[i+1] returned just one character, so what I did was make it return an array:
// Scratch buffer shared by the sprintf macro below.  The identifier is
// _s@T: '@' is allowed in Pawn symbol names, '#' is not.
new _s@T[4096];
// Format into the scratch buffer and evaluate to it, so the result can
// be passed straight on as a string argument.
// NOTE(review): SPRINTF_MAX_STRING is not defined in this snippet;
// presumably it should match the buffer size of 4096 - confirm.
#define sprintf(%1) (format(_s@T, SPRINTF_MAX_STRING, %1), _s@T)
main(){
new num = atoi("ABCDEFG",36);
printf("%d",num);
printf("%s",itoa(num,36));
return 1;
}
// Convert val to its textual representation in the given base, using
// the digits 0-9 followed by A-Z.  Each step inserts the digit for
// val % base at the front of buf.  The loop stops when val reaches 0,
// so a val of 0 produces an empty string; pos is only counted and
// never read.
// NOTE(review): val/base is handed to floatround, which is declared for
// Float values - confirm that this does what is intended.
stock itoa(val, base)
{
new buf[1024] = {0,...};
new LETTERZ[37] = {'0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',0};
for(new pos = 0; val;++pos,val = floatround(val/base,floatround_floor))
strins(buf,sprintf("%c",LETTERZ[val % base]),0);
return buf;
}
// Convert the string val, written with the digits 0-9 and A-Z (upper
// case only), from the given base to a number.  Characters are visited
// from the last one to the first; for each, the digit table is
// searched and a hit adds digit * base^i, where i is the distance from
// the end of the string.  Characters that match no digit below base
// add nothing.
stock atoi(val[], base)
{
new CURRNUM = 0;
new len = strlen(val);
new LETTERZ[37] = {'0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',0};
for(new i = 0; i < len; ++i)
{
for(new x = 0; x < base; ++x)
{
// index of the character that is i places from the end
new y = (len-i)-1;
if(val[y] == LETTERZ[x])
{
CURRNUM += x*floatround(floatpower(base,i));
}
}
}
return CURRNUM;
}

Basic operator overloading in D (Part 2)

Using Tango with D1:
class C
{
private int j;
public int opBinary(char[] op: "+") (ref C x) { return 1; }
public int opBinary(char[] op: "+") (C x) { return 3; }
}
int opBinary(char[] op: "+") (ref C x, ref C y) { return 2; }
int opBinary(char[] op: "+") (C x, C y) { return 2; }
void main() {
C a = new C;
C b = new C;
int j = a + b;
}
Compiler error:
"incompatible types"
meaning the overloaded operators weren't matched.
Can't wait to get the hang of D.
Thanks much.
OH Yea: I'm using Tango with D1, so maybe that's why it's not working? I'd like to stick with Tango. Has anyone used Tango + D2?
In D1 templated operator overloading using opBinary, etc. doesn't work. You need to use opAdd, opSub, etc.

Resources