Any faster way to replace substring in AWK - bash

I have a long string of about 50,000,000 long... , and I am substituting it part by part
cat FILE | tail -n+2 | awk -v k=100 '{
i = 1
while (i<length($0)-k+1) {
x = substr($0, i, k)
if (CONDITION) {
x changed sth
$0 = substr($0,1,i-1) x substr($0,i+k)
}
i += 1
}
gsub(sth,sth,$0)
printf("%s",$0) >> FILE
}'
Are there any ways to replace $0 at position i with x of length k without using this method?
The string is too long and the commands runs extremely slow
sample input:
NNNNNNNNNNggcaaacagaatccagcagcacatcaaaaagcttatccacAGTAATTCATTATATCAAAATGCTCCAggccaggcgtggtggcttatgcc
sample output:
NNNNNNNNNNggcnnncngnnnccngcngcncnncnnnnngcnnnnccncNGNNNNNCNNNNNNNCNNNNNGCNCCNggccnggcgnggnggcnnnngcc
If substring with length k=10 contains >50% of A || a || T || t
(so there are length($0)-k+1 substrings)
substitute A and T with N, a and t with n
The $0 string must maintain it size and sequence (Case sensitive)
EDIT:
I misunderstood the requirement of this problem, and repost the question at here.

Basically:
read a window of characters to two buffers - scratch buffer and output buffer
if in the scratch buffer there are more then some count of characters ATat
then replace all characters ATat in the output buffer buffer to Nn respectively
output one character from the output buffer
flush one character in both buffers
and go to step 1 to repeat reading the characters into buffers
when the end of line is encountered, just flush output buffer and reset it all
A small C program for sure is going to be the fastest:
// The window size
#define N 10
// The percent of the window that has to be equal to one of [AaTt]
#define PERCENT 50
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
// output a string
static void output(char *outme, size_t n) {
fwrite(outme, n, 1, stdout);
}
// is one of [AaTt]
static bool is_one_of_them(char c) {
switch(c) {
case 'A':
case 'a':
case 'T':
case 't':
return true;
}
return false;
}
// Convert one of characters to n/N depending on case
static char convert_them_to_n(char c) {
// switch(c){ case 'T': case 'A': return true; } return false;
// ASCII is assumed
const char m = ~0x1f;
const char w = 'n' & ~m;
return (c & m) | w;
}
static const unsigned threshold = N * PERCENT / 100;
// Store the input in buf
static char buf[N];
// Store the output to-be-outputted in out
static char out[N];
// The current position in buf and out
// The count of readed characters
static size_t pos;
// The count of one of searched characters in buf
static unsigned count_them;
static void buf_reset(void) {
pos = 0;
count_them = 0;
}
static void buf_flush(void) {
output(out, pos);
buf_reset();
}
static void buf_replace_them(void) {
// TODO: this could keep count of characters alrady replaced in out to save CPU
for (size_t i = 0; i < N; ++i) {
if (is_one_of_them(out[i])) {
out[i] = convert_them_to_n(out[i]);
}
}
}
static void buf_flush_one(void) {
assert(pos > 0);
assert(pos == N);
output(out, 1);
count_them -= is_one_of_them(buf[0]);
memmove(buf, buf + 1, pos - 1);
memmove(out, out + 1, pos - 1);
pos--;
}
static void buf_add(char c) {
buf[pos] = out[pos] = c;
pos++;
count_them += is_one_of_them(c);
// if we reached the substring length
if (pos == N) {
// if the count reached the threshold
if (count_them >= threshold) {
// convert the characters to n
buf_replace_them();
}
// flush one character only at a time
buf_flush_one();
}
}
int main() {
int c;
buf_reset();
while ((c = getchar()) != EOF) {
if (c == '\n') {
// If its a newline, just flush what we have buffered
buf_flush();
output("\n", 1);
continue;
}
buf_add(c);
}
buf_flush();
}
Such a C program is easily transferable to for example an awk script, just one need to read one character at a time. Below I split the characters with split, like:
awk -v N=10 -v percent=50 '
BEGIN{ threshold = N * percent / 100; pos=0 }
function is_one_of_them(c) {
return c ~ /^[aAtT]$/;
}
function buf_flush(i) {
for (i = 0; i < pos; ++i) {
printf "%s", out[i]
}
pos = 0
count_them = 0
}
function buf_replace_them(i) {
for (i = 0; i < pos; ++i) {
if (is_one_of_them(out[i])) {
out[i] = out[i] ~ /[AT]/ ? "N" : "n";
}
}
}
function buf_flush_one(i) {
printf "%s", out[0]
count_them -= is_one_of_them(buf[0])
if(0 && debug) {
printf(" count_them %s ", count_them)
for (i = 0; i < pos-1; ++i) {
printf("%s", buf[i+1])
} printf(" ");
for (i = 0; i < pos-1; ++i) {
printf("%s", out[i+1])
}
printf("\n");
}
for (i = 0; i < pos-1; ++i) {
buf[i] = buf[i+1]
out[i] = out[i+1]
}
pos--
}
function buf_add(c) {
buf[pos]=c; out[pos]=c; pos++
count_them += is_one_of_them(c)
if (pos == N) {
if (count_them >= threshold) {
buf_replace_them()
}
buf_flush_one()
}
}
{
split($0, chars, "")
for (idx = 0; idx <= length($0); idx++) {
buf_add(chars[idx])
}
buf_flush();
printf "\n";
}
'
Both programs when run with the input presented in the first line produce the output presented in the second line (note that lone a near the end is not replaced, because there are no 5 charactets ATat in a window of 10 characters from it):
NNNNNNNNNNggcaaacagaatccagcagcacatcaaaaagcttatccacAGTAATTCATTATATCAAAATGCTCCAggccaggcgtggtggcttatgcc
NNNNNNNNNNggcnnncngnnnccngcngcncnncnnnnngcnnnnccncNGNNNNNCNNNNNNNCNNNNNGCNCCNggccaggcgnggnggcnnnngcc
Both solutions were tested on repl.

You need to be careful with how you address this problem. You cannot work on the substituted string. You need to keep track of the original string. Here is a simple example. Assume we have a string consisting of x and y and we want to replace all y with z if there are 8 y in a substring of 10. Imagine your input looks like:
yyyyyyyyxxy
The first substring of 10 reads yyyyyyyyxx and would be translated into zzzzzzzzxx. If you perform the substitution directly into the original string, you get zzzzzzzzxxy. The second substring now reads zzzzzzzxxy, and does not contain 8 times y, while in the original string it does. So according to the solution of the OP, this would lead into inconsistent results, depending on if you start from the front or the back. So a quick solution would be:
awk -v N=10 -v p=50 '
BEGIN { n = N*p/100 }
{ s = $0 }
{ for(i=1;i<=length-N;++i) {
str=substr($0,i,N)
c=gsub(/[AT]/,"N",str) + gsub(/[at]/,"n",str)
if(c >= n) s = substr(s,1,i-1) str substr(s,i+N)
}
}
{ print s }' file
There is ofcourse quite some work you do double here. Imagine you have a string of the form xxyyyyyyyyxx, you would perform 4 concatinations while you only need to do one. So the best idea is to minimalise the work and only check the substrings which end with the respective character:
awk -v N=10 -v p=50 '
BEGIN { n = N*p/100 }
{ s = $0 }
{ i=N; while (match(substr($0,i),/[ATat]/)) {
str=substr($0,i+RSTART-N,N)
c=gsub(/[AT]/,"N",str) + gsub(/[at]/,"n",str)
if(c >= n) { s = substr(s,1,i+RSTART-N-1) str substr(s,i+RSTART)}
i=i+RSTART
}
}
{ print s }' file

To replace $0 at position i with x do:
awk 'BEGIN{i=12345;x="blubber"}
{
printf("%s",substr($0,1,i));
printf("%s",x);
printf("%s",substr($0,i+length(x)));
}'
I don't think there is any faster method.
To replace AGCT with N and agct with n use tr. To replace them only within a range and using awk you should do:
awk 'BEGIN{i=12345;n=123}
{
printf("%s",substr($0,1,i-1));
printf(gsub(/[atgc]/,"n",gsub(/[ATGC]/,"N",substr($0,i,i+n-1))));
printf("%s",substr($0,i+n));
}'
To do more advanced and faster processing you should consider c/c++.

Related

Filter only digit sequences containing a given set of digits

I have a large list of digit strings like this one. The individual strings are relatively short (say less than 50 digits).
data = [
'300303334',
'53210234',
'123456789',
'5374576807063874'
]
I need to find out a efficient data structure (speed first, memory second) and algorithm which returns only those strings that are composed of a given set of digits.
Example results:
filter(data, [0,3,4]) = ['300303334']
filter(data, [0,1,2,3,4,5]) = ['300303334', '53210234']
The data list will usually fit into memory.
For each digit, precompute a postings list that don't contain the digit.
postings = [[] for _ in xrange(10)]
for i, d in enumerate(data):
for j in xrange(10):
digit = str(j)
if digit not in d:
postings[j].append(i)
Now, to find all strings that contain, for example, just the digits [1, 3, 5] you can merge the postings lists for the other digits (ie: 0, 2, 4, 6, 7, 8, 9).
def intersect_postings(p0, p1):
i0, i1 = next(p0), next(p1)
while True:
if i0 == i1:
yield i0
i0, i1 = next(p0), next(p1)
elif i0 < i1: i0 = next(p0)
else: i1 = next(p1)
def find_all(digits):
p = None
for d in xrange(10):
if d not in digits:
if p is None: p = iter(postings[d])
else: p = intersect_postings(p, iter(postings[d]))
return (data[i] for i in p) if p else iter(data)
print list(find_all([0, 3, 4]))
print list(find_all([0, 1, 2, 3, 4, 5]))
A string can be encoded by a 10-bit number. There are 2^10, or 1,024 possible values.
So create a dictionary that uses an integer for a key and a list of strings for the value.
Calculate the value for each string and add that string to the list of strings for that value.
General idea:
Dictionary Lookup;
for each (string in list)
value = 0;
for each character in string
set bit N in value, where N is the character (0-9)
Lookup[value] += string // adds string to list for this value in dictionary
Then, to get a list of the strings that match your criteria, just compute the value and do a direct dictionary lookup.
So if the user asks for strings that contain only 3, 5, and 7:
value = (1 << 3) || (1 << 5) || (1 << 7);
list = Lookup[value];
Note that, as Matt pointed out in comment below, this will only return strings that contain all three digits. So, for example, it wouldn't return 37. That seems like a fatal flaw to me.
Edit
If the number of symbols you have to deal with is very large, then the number of possible combinations becomes too large for this solution to be practical.
With a large number of symbols, I'd recommend an inverted index as suggested in the comments, combined with a secondary filter that removes the strings that contain extraneous digits.
Consider a function f which constructs a bitmask for each string with bit i set if digit i is in the string.
For example,
f('0') = 0b0000000001
f('00') = 0b0000000001
f('1') = 0b0000000010
f('1100') = 0b0000000011
Then I suggest storing a list of strings for each bitmask.
For example,
Bitmask 0b0000000001 -> ['0','00']
Once you have prepared this data structure (which is the same size as your original list), you can then easily access all the strings for a particular filter by accessing all lists where the bitmask is a subset of the digits in your filter.
So for your example of filter [0,3,4] you would return the lists from:
Strings containing just 0
Strings containing just 3
Strings containing just 4
Strings containing 0 and 3
Strings containing 0 and 4
Strings containing 3 and 4
Strings containing 0 and 3 and 4
Example Python Code
from collections import defaultdict
import itertools
raw_data = [
'300303334',
'53210234',
'123456789',
'5374576807063874'
]
def preprocess(raw_data):
data = defaultdict(list)
for s in raw_data:
bitmask = 0
for digit in s:
bitmask |= 1<<int(digit)
data[bitmask].append(s)
return data
def filter(data,mask):
for r in range(len(mask)):
for m in itertools.combinations(mask,r+1):
bitmask = sum(1<<digit for digit in m)
for s in data[bitmask]:
yield s
data = preprocess(raw_data)
for a in filter(data, [0,1,2,3,4,5]):
print a
Just for kicks, I have coded up Jim's lovely algorithm and the Perl is here if anyone wants to play with it. Please do not accept this as an answer or anything, pass all credit to Jim:
#!/usr/bin/perl
use strict;
use warnings;
my $Debug=1;
my $Nwords=1000;
my ($word,$N,$value,$i,$j,$k);
my (#dictionary,%Lookup);
################################################################################
# Generate "words" with random number of characters 5-30
################################################################################
print "DEBUG: Generating $Nwords word dictionary\n" if $Debug;
for($i=0;$i<$Nwords;$i++){
$j = rand(25) + 5; # length of this word
$word="";
for($k=0;$k<$j;$k++){
$word = $word . int(rand(10));
}
$dictionary[$i]=$word;
print "$word\n" if $Debug;
}
# Add some obvious test cases
$dictionary[++$i]="0" x 50;
$dictionary[++$i]="1" x 50;
$dictionary[++$i]="2" x 50;
$dictionary[++$i]="3" x 50;
$dictionary[++$i]="4" x 50;
$dictionary[++$i]="5" x 50;
$dictionary[++$i]="6" x 50;
$dictionary[++$i]="7" x 50;
$dictionary[++$i]="8" x 50;
$dictionary[++$i]="9" x 50;
$dictionary[++$i]="0123456789";
################################################################################
# Encode words
################################################################################
for $word (#dictionary){
$value=0;
for($i=0;$i<length($word);$i++){
$N=substr($word,$i,1);
$value |= 1 << $N;
}
push(#{$Lookup{$value}},$word);
print "DEBUG: $word encoded as $value\n" if $Debug;
}
################################################################################
# Do lookups
################################################################################
while(1){
print "Enter permitted digits, separated with commas: ";
my $line=<STDIN>;
my #digits=split(",",$line);
$value=0;
for my $d (#digits){
$value |= 1<<$d;
}
print "Value: $value\n";
print join(", ",#{$Lookup{$value}}),"\n\n" if defined $Lookup{$value};
}
I like Jim Mischel's approach. It has pretty efficient look up and bounded memory usage. Code in C follows:
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <readline/readline.h>
#include <readline/history.h>
enum {
zero = '0',
nine = '9',
numbers = nine - zero + 1,
masks = 1 << numbers,
};
typedef uint16_t mask;
struct list {
char *s;
struct list *next;
};
typedef struct list list_cell;
typedef struct list *list;
static inline int is_digit(char c) { return c >= zero && c <= nine; }
static inline mask char2mask(char c) { return 1 << (c - zero); }
static inline mask add_char2mask(mask m, char c) {
return m | (is_digit(c) ? char2mask(c) : 0);
}
static inline int is_set(mask m, mask n) { return (m & n) != 0; }
static inline int is_set_char(mask m, char c) { return is_set(m, char2mask(c)); }
static inline int is_submask(mask sub, mask m) { return (sub & m) == sub; }
static inline char *sprint_mask(char buf[11], mask m) {
char *s = buf;
char i;
for(i = zero; i <= nine; i++)
if(is_set_char(m, i)) *s++ = i;
*s = 0;
return buf;
}
static inline mask get_mask(char *s) {
mask m=0;
for(; *s; s++)
m = add_char2mask(m, *s);
return m;
}
static inline int is_empty(list l) { return !l; }
static inline list insert(list *l, char *s) {
list cell = (list)malloc(sizeof(list_cell));
cell->s = s;
cell->next = *l;
return *l = cell;
}
static void *foreach(void *f(char *, void *), list l, void *init) {
for(; !is_empty(l); l = l->next)
init = f(l->s, init);
return init;
}
struct printer_state {
int first;
FILE *f;
};
static void *prin_list_member(char *s, void *data) {
struct printer_state *st = (struct printer_state *)data;
if(st->first) {
fputs(", ", st->f);
} else
st->first = 1;
fputs(s, st->f);
return data;
}
static void print_list(list l) {
struct printer_state st = {.first = 0, .f = stdout};
foreach(prin_list_member, l, (void *)&st);
putchar('\n');
}
static list *init_lu(void) { return (list *)calloc(sizeof(list), masks); }
static list *insert2lu(list lu[masks], char *s) {
mask i, m = get_mask(s);
if(m) // skip string without any number
for(i = m; i < masks; i++)
if(is_submask(m, i))
insert(lu+i, s);
return lu;
}
int usage(const char *name) {
fprintf(stderr, "Usage: %s filename\n", name);
return EXIT_FAILURE;
}
#define handle_error(msg) \
do { perror(msg); exit(EXIT_FAILURE); } while (0)
static inline void chomp(char *s) { if( (s = strchr(s, '\n')) ) *s = '\0'; }
list *load_file(FILE *f) {
char *line = NULL;
size_t len = 0;
ssize_t read;
list *lu = init_lu();
for(; (read = getline(&line, &len, f)) != -1; line = NULL) {
chomp(line);
insert2lu(lu, line);
}
return lu;
}
void read_reqs(list *lu) {
char *line;
char buf[11];
for(; (line = readline("> ")); free(line))
if(*line) {
add_history(line);
mask m = get_mask(line);
printf("mask: %s\nstrings: ", sprint_mask(buf, m));
print_list(lu[m]);
};
putchar('\n');
}
int main(int argc, const char* argv[] ) {
const char *name = argv[0];
FILE *f;
list *lu;
if(argc != 2) return usage(name);
f = fopen(argv[1], "r");
if(!f) handle_error("open");
lu = load_file(f);
fclose(f);
read_reqs(lu);
return EXIT_SUCCESS;
}
To compile use
gcc -lreadline -o digitfilter digitfilter.c
And test run:
$ cat data.txt
300303334
53210234
123456789
5374576807063874
$ ./digitfilter data.txt
> 034
mask: 034
strings: 300303334
> 0,1,2,3,4,5
mask: 012345
strings: 53210234, 300303334
> 0345678
mask: 0345678
strings: 5374576807063874, 300303334
Put each value into a set-- Eg.: '300303334'={3, 0, 4}.
Since the length of your data items are bound by a constant (50),
you can do these at O(1) time for each item using Java HashSet. The overall complexity of this phase adds up to O(n).
For each filter set, use containsAll() of HashSet to see whether
each of these data items is a subset of your filter. Takes O(n).
Takes O(m*n) in the overall where n is the number of data items and m the number of filters.

Converting lower/upper case letters without ctype.h

I just saw that this could technically work, the only mistake I couldn´t resolve was the last ASCII character that gets printed everytime I test it out, I also tested this out without using the name variable, I mean just making a substraction of 32 to any lower case letter in ASCII should give me their upper case one and it does, but I´m curious on why I´m getting an additional char, wich from what I see in screen is apparently Û.
#include <stdio.h>
main()
{
char name[22];
int i;
fputs("Type your name ",stdout);
fgets(name,22,stdin);
for (i = 0; name[i] != '\0'; i = i + 1)
printf("%c",(name[i])-32); /*This will convert lower case to upper */
/* using as reference the ASCII table*/
fflush(stdin);
getchar();
}
Perhaps there is a line break character at the end of the string.
You can check the chararacter code, so that you only convert characters that actually are lower case letters:
for (i = 0; name[i] != '\0'; i = i + 1) {
char c = name[i];
if (c => 97 && c <= 122) {
c -= 32;
}
printf("%c", c);
}
void read_chararray(char in_array[], int* Length)
{
int Indx = 0, Indx2 = 0, Indx3 = 0; // int declarations for indexs of some loops
char cinput = { 0 }, word[255] = { 0 }, word2[255] = { 0 }; // declaration of cinput and first char array before punctiation removed
for (Indx = 0; (cinput = getchar()) != '\n'; Indx++) { // Loop for getting characters from user stop at <enter>
word[Indx] = cinput; // Placing char into array while changing to lowercase
}
Indx2 = Indx; // Set Indx2 to Indx for loop operation
for (Indx = 0; Indx < Indx2; Indx++) { // Loop to check and replace upper characters with lower
cinput = word[Indx];
if (cinput >= 65 && cinput <= 90) { // If cinput is within the ASCII range 65 and 90, this indicates upper characters
cinput += 32; // Add 32 to cinput to shift to the lower character range within the ASCII table
in_array[Indx] = cinput; // Input new value into array pointer
}
else if (cinput >= 97 && cinput <= 122) // scans if character are lower ASCII, places them in array irraticating punctuation and whitespce
in_array[Indx] = cinput; // Input remaining lower case into array pointer
}
*Length = Indx; // final size of array set to Length variable for future use
}
#include<stdio.h>
void upper(char);
void main()
{
char ch;
printf("\nEnter the character in lower case");
scanf("%c", &ch);
upper(ch);
}
void upper( char c)
{
printf("\nUpper Case: %c", c-32);
}

custom ITOA not working right?

I wanted to make a custom ITOA function to put large numbers into small strings, this is what I have coded :
main(){
printf("itoa(2000000000,36)= '%s'",itoa(2000000000,36));
printf("itoa(36,36)= '%s'",itoa(36,36));
printf("itoa(37,36)= '%s'",itoa(37,36));
return 1;
}
stock itoa(val, base)
{
new buf[1024] = {0,...};
new i = 1023;
new LETTERZ[37] = {'0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',0};
for(; val && i; --i, val /= base)
buf[i] = LETTERZ[val % base];
return buf[i+1];
}
It's based on 'C' code from this page: http://www.jb.man.ac.uk/~slowe/cpp/itoa.html
But somehow this is the output:
[20:34:35] itoa(2000000000,36)= 'X'
[20:34:35] itoa(36,36)= '1'
[20:34:35] itoa(37,36)= '1'
And this is totally wrong, I don't know which output to expect but 36 and 37 for sure can't be the same output and 2 000 000 000 can't be just 'X', as X is suposed to be 35, not 2 000 000 000,
ZZ should be 1295 I think... I want to base this on the hexadecimal system, but with all the alfabet letters.
Could anyone tell me what's wrong here?
I'm working with a typeless language called PAWN (also known as SMALL) and later i want to use this code in VB.NET
/* itoa example */
#include <stdio.h>
#include <stdlib.h>
int main ()
{
int i;
char buffer [33];
printf ("Enter a number: ");
scanf ("%d",&i);
itoa (i,buffer,10);
printf ("decimal: %s\n",buffer);
itoa (i,buffer,16);
printf ("hexadecimal: %s\n",buffer);
itoa (i,buffer,2);
printf ("binary: %s\n",buffer);
return 0;
}
You only give the number and the base, but parameter 2 needs a pointer to char already allocated. Use a buffer or try NULL, so the function will return the result.
THe solution seemed to be simple, the return buf[i+1] just returned one character so what I did is make it return an array:
new _s#T[4096];
#define sprintf(%1) (format(_s#T, SPRINTF_MAX_STRING, %1), _s#T)
main(){
new num = atoi("ABCDEFG",36);
printf("%d",num);
printf("%s",itoa(num,36));
return 1;
}
stock itoa(val, base)
{
new buf[1024] = {0,...};
new LETTERZ[37] = {'0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',0};
for(new pos = 0; val;++pos,val = floatround(val/base,floatround_floor))
strins(buf,sprintf("%c",LETTERZ[val % base]),0);
return buf;
}
stock atoi(val[], base)
{
new CURRNUM = 0;
new len = strlen(val);
new LETTERZ[37] = {'0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',0};
for(new i = 0; i < len; ++i)
{
for(new x = 0; x < base; ++x)
{
new y = (len-i)-1;
if(val[y] == LETTERZ[x])
{
CURRNUM += x*floatround(floatpower(base,i));
}
}
}
return CURRNUM;
}

Finding shortest repeating cycle in word?

I'm about to write a function which, would return me a shortest period of group of letters which would eventually create the given word.
For example word abkebabkebabkeb is created by repeated abkeb word. I would like to know, how efficiently analyze input word, to get the shortest period of characters creating input word.
Here is a correct O(n) algorithm. The first for loop is the table building portion of KMP. There are various proofs that it always runs in linear time.
Since this question has 4 previous answers, none of which are O(n) and correct, I heavily tested this solution for both correctness and runtime.
def pattern(inputv):
if not inputv:
return inputv
nxt = [0]*len(inputv)
for i in range(1, len(nxt)):
k = nxt[i - 1]
while True:
if inputv[i] == inputv[k]:
nxt[i] = k + 1
break
elif k == 0:
nxt[i] = 0
break
else:
k = nxt[k - 1]
smallPieceLen = len(inputv) - nxt[-1]
if len(inputv) % smallPieceLen != 0:
return inputv
return inputv[0:smallPieceLen]
O(n) solution. Assumes that the entire string must be covered. The key observation is that we generate the pattern and test it, but if we find something along the way that doesn't match, we must include the entire string that we already tested, so we don't have to reobserve those characters.
def pattern(inputv):
pattern_end =0
for j in range(pattern_end+1,len(inputv)):
pattern_dex = j%(pattern_end+1)
if(inputv[pattern_dex] != inputv[j]):
pattern_end = j;
continue
if(j == len(inputv)-1):
print pattern_end
return inputv[0:pattern_end+1];
return inputv;
This is an example for PHP:
<?php
function getrepeatedstring($string) {
if (strlen($string)<2) return $string;
for($i = 1; $i<strlen($string); $i++) {
if (substr(str_repeat(substr($string, 0, $i),strlen($string)/$i+1), 0, strlen($string))==$string)
return substr($string, 0, $i);
}
return $string;
}
?>
Most easiest one in python:
def pattern(self, s):
ans=(s+s).find(s,1,-1)
return len(pat) if ans == -1 else ans
I believe there is a very elegant recursive solution. Many of the proposed solutions solve the extra complexity where the string ends with part of the pattern, like abcabca. But I do not think that is asked for.
My solution for the simple version of the problem in clojure:
(defn find-shortest-repeating [pattern string]
(if (empty? (str/replace string pattern ""))
pattern
(find-shortest-repeating (str pattern (nth string (count pattern))) string)))
(find-shortest-repeating "" "abcabcabc") ;; "abc"
But be aware that this will not find patterns that are uncomplete at the end.
I found a solution based on your post, that could take an incomplete pattern:
(defn find-shortest-repeating [pattern string]
(if (or (empty? (clojure.string/split string (re-pattern pattern)))
(empty? (second (clojure.string/split string (re-pattern pattern)))))
pattern
(find-shortest-repeating (str pattern (nth string (count pattern))) string)))
My Solution:
The idea is to find a substring from the position zero such that it becomes equal to the adjacent substring of same length, when such a substring is found return the substring. Please note if no repeating substring is found I am printing the entire input String.
public static void repeatingSubstring(String input){
for(int i=0;i<input.length();i++){
if(i==input.length()-1){
System.out.println("There is no repetition "+input);
}
else if(input.length()%(i+1)==0){
int size = i+1;
if(input.substring(0, i+1).equals(input.substring(i+1, i+1+size))){
System.out.println("The subString which repeats itself is "+input.substring(0, i+1));
break;
}
}
}
}
This is a solution I came up with using the queue, it passed all the test cases of a similar problem in codeforces. Problem No is 745A.
#include<bits/stdc++.h>
using namespace std;
typedef long long ll;
int main()
{
ios_base::sync_with_stdio(false);
cin.tie(NULL);
string s, s1, s2; cin >> s; queue<char> qu; qu.push(s[0]); bool flag = true; int ind = -1;
s1 = s.substr(0, s.size() / 2);
s2 = s.substr(s.size() / 2);
if(s1 == s2)
{
for(int i=0; i<s1.size(); i++)
{
s += s1[i];
}
}
//cout << s1 << " " << s2 << " " << s << "\n";
for(int i=1; i<s.size(); i++)
{
if(qu.front() == s[i]) {qu.pop();}
qu.push(s[i]);
}
int cycle = qu.size();
/*queue<char> qu2 = qu; string str = "";
while(!qu2.empty())
{
cout << qu2.front() << " ";
str += qu2.front();
qu2.pop();
}*/
while(!qu.empty())
{
if(s[++ind] != qu.front()) {flag = false; break;}
qu.pop();
}
flag == true ? cout << cycle : cout << s.size();
return 0;
}
Simpler answer which I can come up in an interview is just a O(n^2) solution, which tries out all combinations of substring starting from 0.
int findSmallestUnit(string str){
for(int i=1;i<str.length();i++){
int j=0;
for(;j<str.length();j++){
if(str[j%i] != str[j]){
break;
}
}
if(j==str.length()) return str.substr(0,i);
}
return str;
}
Now if someone is interested in O(n) solution to this problem in c++:
int findSmallestUnit(string str){
vector<int> lps(str.length(),0);
int i=1;
int len=0;
while(i<str.length()){
if(str[i] == str[len]){
len++;
lps[i] = len;
i++;
}
else{
if(len == 0) i++;
else{
len = lps[len-1];
}
}
}
int n=str.length();
int x = lps[n-1];
if(n%(n-x) == 0){
return str.substr(0,n-x);
}
return str;
}
The above is just #Buge's answer in c++, since someone asked in comments.
Regex solution:
Use the following regex replacement to find the shortest repeating substring, and only keeping that substring:
^(.+?)\1*$
$1
Explanation:
^(.+?)\1*$
^ $ # Start and end, to match the entire input-string
( ) # Capture group 1:
.+ # One or more characters,
? # with a reluctant instead of greedy match†
\1* # Followed by the first capture group repeated zero or more times
$1 # Replace the entire input-string with the first capture group match,
# removing all other duplicated substrings
† Greedy vs reluctant would in this case mean: greedy = consumes as many characters as it can; reluctant = consumes as few characters as it can. Since we want the shortest repeating substring, we would want a reluctant match in our regex.
Example input: "abkebabkebabkeb"
Example output: "abkeb"
Try it online in Retina.
Here an example implementation in Java.
Super delayed answer, but I got the question in an interview, here was my answer (probably not the most optimal but it works for strange test cases as well).
private void run(String[] args) throws IOException {
File file = new File(args[0]);
BufferedReader buffer = new BufferedReader(new FileReader(file));
String line;
while ((line = buffer.readLine()) != null) {
ArrayList<String> subs = new ArrayList<>();
String t = line.trim();
String out = null;
for (int i = 0; i < t.length(); i++) {
if (t.substring(0, t.length() - (i + 1)).equals(t.substring(i + 1, t.length()))) {
subs.add(t.substring(0, t.length() - (i + 1)));
}
}
subs.add(0, t);
for (int j = subs.size() - 2; j >= 0; j--) {
String match = subs.get(j);
int mLength = match.length();
if (j != 0 && mLength <= t.length() / 2) {
if (t.substring(mLength, mLength * 2).equals(match)) {
out = match;
break;
}
} else {
out = match;
}
}
System.out.println(out);
}
}
Testcases:
abcabcabcabc
bcbcbcbcbcbcbcbcbcbcbcbcbcbc
dddddddddddddddddddd
adcdefg
bcbdbcbcbdbc
hellohell
Code returns:
abc
bc
d
adcdefg
bcbdbc
hellohell
Works in cases such as bcbdbcbcbdbc.
function smallestRepeatingString(sequence){
var currentRepeat = '';
var currentRepeatPos = 0;
for(var i=0, ii=sequence.length; i<ii; i++){
if(currentRepeat[currentRepeatPos] !== sequence[i]){
currentRepeatPos = 0;
// Add next character available to the repeat and reset i so we don't miss any matches inbetween
currentRepeat = currentRepeat + sequence.slice(currentRepeat.length, currentRepeat.length+1);
i = currentRepeat.length-1;
}else{
currentRepeatPos++;
}
if(currentRepeatPos === currentRepeat.length){
currentRepeatPos = 0;
}
}
// If repeat wasn't reset then we didn't find a full repeat at the end.
if(currentRepeatPos !== 0){ return sequence; }
return currentRepeat;
}
I came up with a simple solution that works flawlessly even with very large strings.
PHP Implementation:
function get_srs($s){
$hash = md5( $s );
$i = 0; $p = '';
do {
$p .= $s[$i++];
preg_match_all( "/{$p}/", $s, $m );
} while ( ! hash_equals( $hash, md5( implode( '', $m[0] ) ) ) );
return $p;
}

How to find validity of a string of parentheses, curly brackets and square brackets?

I recently came in contact with this interesting problem. You are given a string containing just the characters '(', ')', '{', '}', '[' and ']', for example, "[{()}]", you need to write a function which will check validity of such an input string, function may be like this:
bool isValid(char* s);
these brackets have to close in the correct order, for example "()" and "()[]{}" are all valid but "(]", "([)]" and "{{{{" are not!
I came out with following O(n) time and O(n) space complexity solution, which works fine:
Maintain a stack of characters.
Whenever you find opening braces '(', '{' OR '[' push it on the stack.
Whenever you find closing braces ')', '}' OR ']' , check if top of stack is corresponding opening bracket, if yes, then pop the stack, else break the loop and return false.
Repeat steps 2 - 3 until end of the string.
This works, but can we optimize it for space, may be constant extra space, I understand that time complexity cannot be less than O(n) as we have to look at every character.
So my question is can we solve this problem in O(1) space?
With reference to the excellent answer from Matthieu M., here is an implementation in C# that seems to work beautifully.
/// <summary>
/// Checks to see if brackets are well formed.
/// Passes "Valid parentheses" challenge on www.codeeval.com,
/// which is a programming challenge site much like www.projecteuler.net.
/// </summary>
/// <param name="input">Input string, consisting of nothing but various types of brackets.</param>
/// <returns>True if brackets are well formed, false if not.</returns>
static bool IsWellFormedBrackets(string input)
{
string previous = "";
while (input.Length != previous.Length)
{
previous = input;
input = input
.Replace("()", String.Empty)
.Replace("[]", String.Empty)
.Replace("{}", String.Empty);
}
return (input.Length == 0);
}
Essentially, all it does is remove pairs of brackets until there are none left to remove; if there is anything left the brackets are not well formed.
Examples of well formed brackets:
()[]
{()[]}
Example of malformed brackets:
([)]
{()[}]
Actually, there's a deterministic log-space algorithm due to Ritchie and Springsteel: http://dx.doi.org/10.1016/S0019-9958(72)90205-7 (paywalled, sorry not online). Since we need log bits to index the string, this is space-optimal.
If you're willing to accept one-sided error, then there's an algorithm that uses n polylog(n) time and polylog(n) space: http://www.eccc.uni-trier.de/report/2009/119/
If the input is read-only, I don't think we can do O(1) space. It is a well known fact that any O(1) space decidable language is regular (i.e writeable as a regular expression). The set of strings you have is not a regular language.
Of course, this is about a Turing Machine. I would expect it to be true for fixed word RAM machines too.
Edit: Although simple, this algorithm is actually O(n^2) in terms of character comparisons. To demonstrate it, one can simply generate a string as '(' * n + ')' * n.
I have a simple, though perhaps erroneous idea, that I will submit to your criticisms.
It's a destructive algorithm, which means that if you ever need the string it would not help (since you would need to copy it down).
Otherwise, the algorithm work with a simple index within the current string.
The idea is to remove pairs one after the others:
([{}()])
([()])
([])
()
empty -> OK
It is based on the simple fact that if we have matching pairs, then at least one is of the form () without any pair character in between.
Algorithm:
i := 0
Find a matching pair from i. If none is found, then the string is not valid. If one is found, let i be the index of the first character.
Remove [i:i+1] from the string
If i is at the end of the string, and the string is not empty, it's a failure.
If [i-1:i] is a matching pair, i := i-1 and back to 3.
Else, back to 1.
The algorithm is O(n) in complexity because:
each iteration of the loop removes 2 characters from the string
the step 2., which is linear, is naturally bound (i cannot grow indefinitely)
And it's O(1) in space because only the index is required.
Of course, if you can't afford to destroy the string, then you'll have to copy it, and that's O(n) in space so no real benefit there!
Unless, of course, I am deeply mistaken somewhere... and perhaps someone could use the original idea (there is a pair somewhere) to better effect.
I doubt you'll find a better solution, since even if you use internal functions to regexp or count occurrences, they still have a O(...) cost. I'd say your solution is the best :)
To optimize for space you could do some run-length encoding on your stack, but I doubt it would gain you very much, except in cases like {{{{{{{{{{}}}}}}}}}}.
http://www.sureinterview.com/shwqst/112007
It is natural to solve this problem with a stack.
If only '(' and ')' are used, the stack is not necessary. We just need to maintain a counter for the unmatched left '('. The expression is valid if the counter is always non-negative during the match and is zero at the end.
In general case, although the stack is still necessary, the depth of the stack can be reduced by using a counter for unmatched braces.
This is an working java code where I filter out the brackets from the string expression and then check the well formedness by replacing wellformed braces by nulls
Sample input = (a+{b+c}-[d-e])+[f]-[g] FilterBrackets will output = ({}[])[][] Then I check for wellformedness.
Comments welcome.
public class ParanString {
public static void main(String[] args) {
String s = FilterBrackets("(a+{b+c}-[d-e])[][]");
while ((s.length()!=0) && (s.contains("[]")||s.contains("()")||s.contains("{}")))
{
//System.out.println(s.length());
//System.out.println(s);
s = s.replace("[]", "");
s = s.replace("()", "");
s = s.replace("{}", "");
}
if(s.length()==0)
{
System.out.println("Well Formed");
}
else
{
System.out.println("Not Well Formed");
}
}
public static String FilterBrackets(String str)
{
int len=str.length();
char arr[] = str.toCharArray();
String filter = "";
for (int i = 0; i < len; i++)
{
if ((arr[i]=='(') || (arr[i]==')') || (arr[i]=='[') || (arr[i]==']') || (arr[i]=='{') || (arr[i]=='}'))
{
filter=filter+arr[i];
}
}
return filter;
}
}
The following modification of Sbusidan's answer is O(n2) time complex but O(log n) space simple.
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
char opposite(char bracket) {
switch(bracket) {
case '[':
return ']';
case '(':
return ')';
}
}
bool is_balanced(int length, char *s) {
int depth, target_depth, index;
char target_bracket;
if(length % 2 != 0) {
return false;
}
for(target_depth = length/2; target_depth > 0; target_depth--) {
depth=0;
for(index = 0; index < length; index++) {
switch(s[index]) {
case '(':
case '[':
depth++;
if(depth == target_depth) target_bracket = opposite(s[index]);
break;
case ')':
case ']':
if(depth == 0) return false;
if(depth == target_depth && s[index] != target_bracket) return false;
depth--;
break;
}
}
}
}
void main(char* argv[]) {
char input[] = "([)[(])]";
char *balanced = is_balanced(strlen(input), input) ? "balanced" : "imbalanced";
printf("%s is %s.\n", input, balanced);
}
If you can overwrite the input string (not reasonable in the use cases I envision, but what the heck...) you can do it in constant space, though I believe the time requirement goes up to O(n2).
Like this:
string s = input
char c = null
int i=0
do
if s[i] isAOpenChar()
c = s[i]
else if
c = isACloseChar()
if closeMatchesOpen(s[i],c)
erase s[i]
while s[--i] != c ;
erase s[i]
c == null
i = 0; // Not optimal! It would be better to back up until you find an opening character
else
return fail
end if
while (s[++i] != EOS)
if c==null
return pass
else
return fail
The essence of this is to use the early part of the input as the stack.
I know I'm a little late to this party; it's also my very first post on StackOverflow.
But when I looked through the answers, I thought I might be able to come up with a better solution.
So my solution is to use a few pointers.
It doesn't even have to use any RAM storage, as registers can be used for this.
I have not tested the code; it's written it on the fly.
You'll need to fix my typos, and debug it, but I believe you'll get the idea.
Memory usage: Only the CPU registers in most cases.
CPU usage: It depends, but approximately twice the time it takes to read the string.
Modifies memory: No.
b: string beginning, e: string end.
l: left position, r: right position.
c: char, m: match char
if r reaches the end of the string, we have a success.
l goes backwards from r towards b.
Whenever r meets a new start kind, set l = r.
when l reaches b, we're done with the block; jump to beginning of next block.
const char *chk(const char *b, int len) /* option 2: remove int len */
{
char c, m;
const char *l, *r;
e = &b[len]; /* option 2: remove. */
l = b;
r = b;
while(r < e) /* option 2: change to while(1) */
{
c = *r++;
/* option 2: if(0 == c) break; */
if('(' == c || '{' == c || '[' == c)
{
l = r;
}
else if(')' == c || ']' == c || '}' == c)
{
/* find 'previous' starting brace */
m = 0;
while(l > b && '(' != m && '[' != m && '{' != m)
{
m = *--l;
}
/* now check if we have the correct one: */
if(((m & 1) + 1 + m) != c) /* cryptic: convert starting kind to ending kind and match with c */
{
return(r - 1); /* point to error */
}
if(l <= b) /* did we reach the beginning of this block ? */
{
b = r; /* set new beginning to 'head' */
l = b; /* obsolete: make left is in range. */
}
}
}
m = 0;
while(l > b && '(' != m && '[' != m && '{' != m)
{
m = *--l;
}
return(m ? l : NULL); /* NULL-pointer for OK */
}
After thinking about this approach for a while, I realized that it will not work as it is right now.
The problem will be that if you have "[()()]", it'll fail when reaching the ']'.
But instead of deleting the proposed solution, I'll leave it here, as it's actually not impossible to make it work, it does require some modification, though.
/**
*
* #author madhusudan
*/
public class Main {
/**
* #param args the command line arguments
*/
public static void main(String[] args) {
new Main().validateBraces("()()()()(((((())))))()()()()()()()()");
// TODO code application logic here
}
/**
* #Use this method to validate braces
*/
public void validateBraces(String teststr)
{
StringBuffer teststr1=new StringBuffer(teststr);
int ind=-1;
for(int i=0;i<teststr1.length();)
{
if(teststr1.length()<1)
break;
char ch=teststr1.charAt(0);
if(isClose(ch))
break;
else if(isOpen(ch))
{
ind=teststr1.indexOf(")", i);
if(ind==-1)
break;
teststr1=teststr1.deleteCharAt(ind).deleteCharAt(i);
}
else if(isClose(ch))
{
teststr1=deleteOpenBraces(teststr1,0,i);
}
}
if(teststr1.length()>0)
{
System.out.println("Invalid");
}else
{
System.out.println("Valid");
}
}
public boolean isOpen(char ch)
{
if("(".equals(Character.toString(ch)))
{
return true;
}else
return false;
}
public boolean isClose(char ch)
{
if(")".equals(Character.toString(ch)))
{
return true;
}else
return false;
}
public StringBuffer deleteOpenBraces(StringBuffer str,int start,int end)
{
char ar[]=str.toString().toCharArray();
for(int i=start;i<end;i++)
{
if("(".equals(ar[i]))
str=str.deleteCharAt(i).deleteCharAt(end);
break;
}
return str;
}
}
Instead of putting braces into the stack, you could use two pointers to check the characters of the string. one start from the beginning of the string and the other start from end of the string. something like
bool isValid(char* s) {
start = find_first_brace(s);
end = find_last_brace(s);
while (start <= end) {
if (!IsPair(start,end)) return false;
// move the pointer forward until reach a brace
start = find_next_brace(start);
// move the pointer backward until reach a brace
end = find_prev_brace(end);
}
return true;
}
Note that there are some corner case not handled.
I think that you can implement an O(n) algorithm. Simply you have to initialise an counter variable for each type: curly, square and normal brackets. After than you should iterate the string and should increase the coresponding counter if the bracket is opened, otherwise to decrease it. If the counter is negative return false. AfterI think that you can implement an O(n) algorithm. Simply you have to initialise an counter variable for each type: curly, square and normal brackets. After than you should iterate the string and should increase the coresponding counter if the bracket is opened, otherwise to decrease it. If the counter is negative return false. After you count all brackets, you should check if all counters are zero. In that case, the string is valid and you should return true.
You could provide the value and check if its a valid one, it would print YES otherwise it would print NO
static void Main(string[] args)
{
string value = "(((([{[(}]}]))))";
List<string> jj = new List<string>();
if (!(value.Length % 2 == 0))
{
Console.WriteLine("NO");
}
else
{
bool isValid = true;
List<string> items = new List<string>();
for (int i = 0; i < value.Length; i++)
{
string item = value.Substring(i, 1);
if (item == "(" || item == "{" || item == "[")
{
items.Add(item);
}
else
{
string openItem = items[items.Count - 1];
if (((item == ")" && openItem == "(")) || (item == "}" && openItem == "{") || (item == "]" && openItem == "["))
{
items.RemoveAt(items.Count - 1);
}
else
{
isValid = false;
break;
}
}
}
if (isValid)
{
Console.WriteLine("Yes");
}
else
{
Console.WriteLine("NO");
}
}
Console.ReadKey();
}
var verify = function(text)
{
var symbolsArray = ['[]', '()', '<>'];
var symbolReg = function(n)
{
var reg = [];
for (var i = 0; i < symbolsArray.length; i++) {
reg.push('\\' + symbolsArray[i][n]);
}
return new RegExp('(' + reg.join('|') + ')','g');
};
// openReg matches '(', '[' and '<' and return true or false
var openReg = symbolReg(0);
// closeReg matches ')', ']' and '>' and return true or false
var closeReg = symbolReg(1);
// nestTest matches openSymbol+anyChar+closeSymbol
// and returns an obj with the match str and it's start index
var nestTest = function(symbols, text)
{
var open = symbols[0]
, close = symbols[1]
, reg = new RegExp('(\\' + open + ')([\\s\\S])*(\\' + close + ')','g')
, test = reg.exec(text);
if (test) return {
start: test.index,
str: test[0]
};
else return false;
};
var recursiveCheck = function(text)
{
var i, nestTests = [], test, symbols;
// nestTest with each symbol
for (i = 0; i < symbolsArray.length; i++)
{
symbols = symbolsArray[i];
test = nestTest(symbols, text);
if (test) nestTests.push(test);
}
// sort tests by start index
nestTests.sort(function(a, b)
{
return a.start - b.start;
});
if (nestTests.length)
{
// build nest data: calculate match end index
for (i = 0; i < nestTests.length; i++)
{
test = nestTests[i];
var end = test.start + ( (test.str) ? test.str.length : 0 );
nestTests[i].end = end;
var last = (nestTests[i + 1]) ? nestTests[i + 1].index : text.length;
nestTests[i].pos = text.substring(end, last);
}
for (i = 0; i < nestTests.length; i++)
{
test = nestTests[i];
// recursive checks what's after the nest
if (test.pos.length && !recursiveCheck(test.pos)) return false;
// recursive checks what's in the nest
if (test.str.length) {
test.str = test.str.substring(1, test.str.length - 1);
return recursiveCheck(test.str);
} else return true;
}
} else {
// if no nests then check for orphan symbols
var closeTest = closeReg.test(text);
var openTest = openReg.test(text);
return !(closeTest || openTest);
}
};
return recursiveCheck(text);
};
Using c# OOPS programming... Small and simple solution
Console.WriteLine("Enter the string");
string str = Console.ReadLine();
int length = str.Length;
if (length % 2 == 0)
{
while (length > 0 && str.Length > 0)
{
for (int i = 0; i < str.Length; i++)
{
if (i + 1 < str.Length)
{
switch (str[i])
{
case '{':
if (str[i + 1] == '}')
str = str.Remove(i, 2);
break;
case '(':
if (str[i + 1] == ')')
str = str.Remove(i, 2);
break;
case '[':
if (str[i + 1] == ']')
str = str.Remove(i, 2);
break;
}
}
}
length--;
}
if(str.Length > 0)
Console.WriteLine("Invalid input");
else
Console.WriteLine("Valid input");
}
else
Console.WriteLine("Invalid input");
Console.ReadKey();
This is my solution to the problem.
O(n) is the complexity of time without complexity of space.
Code in C.
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
bool checkBraket(char *s)
{
int curly = 0, rounded = 0, squre = 0;
int i = 0;
char ch = s[0];
while (ch != '\0')
{
if (ch == '{') curly++;
if (ch == '}') {
if (curly == 0) {
return false;
} else {
curly--; }
}
if (ch == '[') squre++;
if (ch == ']') {
if (squre == 0) {
return false;
} else {
squre--;
}
}
if (ch == '(') rounded++;
if (ch == ')') {
if (rounded == 0) {
return false;
} else {
rounded--;
}
}
i++;
ch = s[i];
}
if (curly == 0 && rounded == 0 && squre == 0){
return true;
}
else {
return false;
}
}
void main()
{
char mystring[] = "{{{{{[(())}}]}}}";
int answer = checkBraket(mystring);
printf("my answer is %d\n", answer);
return;
}

Resources