Starting from a text that contains escape sequences like \u00f9, \u00a0, \u00e8, I would like to replace them with the characters they encode (ù, è, etc.).
Here is my current implementation, which for some reason every now and then deletes pieces of other words, and I don't understand why:
// All four searches run once, up front, before any replacement has been made.
pos1 = str2.find("\\u00a0");
pos2 = str2.find("\\u00");
pos3 = str2.find("\\u20");
pos4 = str2.find("\\r\\n");
// Erase every literal \u00a0 (6 characters), re-searching from the start each time.
while (pos1 != std::string::npos)
{
str2.replace(pos1, 6, "");
pos1 = str2.find("\\u00a0");
}
// Replace 6 characters at each "\u00" match with "?".
// The first iteration still uses the pos2 computed before the loop above edited str2.
while (pos2 != std::string::npos)
{
str2.replace(pos2, 6, "?");
pos2 = str2.find("\\u00");
}
// Same for "\u20"; the first iteration uses the pos3 computed before both loops above.
while (pos3 != std::string::npos)
{
str2.replace(pos3, 6, "?");
pos3 = str2.find("\\u20");
}
// Searches for the 4-character text \r\n but replaces only 2 characters with a newline.
// The first iteration uses the pos4 computed before all the loops above.
while (pos4 != std::string::npos)
{
str2.replace(pos4, 2, "\n");
pos4 = str2.find("\\r\\n");
}
and there's an example of the text:
William Shakespeare \u00e8 stato un drammaturgo e poeta inglese, considerato come il pi\u00f9 importante scrittore in inglese e generalmente ritenuto il pi\u00f9 eminente drammaturgo della cultura occidentale.\u00a0\r\n
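One problem I see is that \u... character sequences are always 6 characters long, but you search for only a 4-character prefix (\u00, \u20) and then replace 6 characters from wherever that prefix was found; likewise, the \r\n loop matches 4 characters but replaces only 2, leaving the other 2 characters in the string.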
Also, you are searching for all of the sequences up front, but then you are not taking into account that any replacements the pos1 loop makes will invalidate pos2, pos3 and pos4. Then any replacements the pos2 loop makes will invalidate pos3 and pos4. And so on.
I would suggest a completely different parsing strategy:
#include <string>
#include <sstream>
#include <iomanip>
...
// Decode backslash escapes in str2 in place, scanning left to right so that
// every position is computed against the string as it currently is.
//   \r \n \t \\  -> the corresponding single character
//   \uXXXX       -> one byte when the code point is mapped below or fits in
//                   0x00..0xFF, otherwise '?'
auto pos = str2.find('\\');
while (pos != std::string::npos)
{
    // Step onto the character after the backslash; a trailing lone
    // backslash has nothing to decode.
    if (++pos == str2.size())
        break;

    char ch = '?';
    switch (str2[pos])
    {
        case 'r':
            ch = '\r';
            str2.replace(pos-1, 2, &ch, 1);
            /* or:
            str2.replace(--pos, 2, "");
            */
            break;
        case 'n':
            ch = '\n';
            str2.replace(pos-1, 2, &ch, 1);
            break;
        case 't':
            ch = '\t';
            str2.replace(pos-1, 2, &ch, 1);
            break;
        case '\\':
            ch = '\\';
            str2.replace(pos-1, 2, &ch, 1);
            break;
        case 'u': {
            // Parse up to four characters after the 'u' as a hexadecimal code unit.
            std::istringstream iss(str2.substr(pos+1, 4));
            unsigned short u;
            if (!(iss >> std::hex >> u)) {
                ch = '?';
            }
            else
            {
                switch (u)
                {
                    case 0x2019:
                        ch = '\'';
                        break;
                    // ... add further explicit code-point mappings here ...
                    default:
                        // Only 0x00..0xFF fits in one byte; 0x100 would
                        // narrow to a NUL character.
                        if (u <= 0xFF)
                            ch = static_cast<char>(u);
                        else
                            ch = '?';
                        break;
                }
            }
            // replace() clamps the count, so a truncated sequence at the end
            // of the string is handled as well.
            str2.replace(pos-1, 6, &ch, 1);
            break;
        }
        default:
            // Unknown escape: leave both characters untouched.
            break;
    }
    // After a replacement, pos already points just past the decoded character.
    pos = str2.find('\\', pos);
}
Live Demo
Related
I get the error "Cannot access memory at address 0x100403055" when I try and set a memory value to 0x00 when stopped in the debugger.
Is there a special switch I need to set to enable the set operation?
Here is my complete C code file "main.c"
#include <stdio.h>
#include <string.h>
/*
   separator - advance *tokens past everything that is not part of a token:
       '#' up to and including the next '\n'   (comment)
       '{'                                      (increments *nest)
       '}'                                      (decrements *nest)
       ' ', '\t', '\n'                          (whitespace)
   On return *tokens points at the first character of the next token, or at
   the terminating NUL.
*/
static void separator(int *nest, char **tokens) {
    char *cursor = *tokens;
    int scanning = 1;

    while (scanning) {
        switch (*cursor) {
        case '#':
            /* Comment: drop everything up to and including the newline. */
            cursor++;
            while (*cursor != '\0') {
                if (*cursor++ == '\n')
                    break;
            }
            break;
        case '{':
            ++*nest;
            cursor++;
            break;
        case '}':
            --*nest;
            cursor++;
            break;
        case ' ':
        case '\n':
        case '\t':
            cursor++;
            break;
        default:
            /* NUL terminator or the first character of the next token. */
            scanning = 0;
            break;
        }
    }
    *tokens = cursor;
}
/*
   token - return the token that starts at *tokens.
   The token ends at the first '#', ' ', '\t', '\n', '{', '}' or NUL.  The
   separator run that follows is consumed (updating *nest) so that *tokens
   ends up at the start of the next token, and the first character after the
   token is overwritten with NUL.  The buffer must therefore be writable.
*/
static char *token(int *nest, char **tokens) {
    const char terminator = '\0';
    char *first = *tokens;
    /* strcspn yields the length of the leading run free of delimiters. */
    char *stop = first + strcspn(first, "# \t\n{}");

    *tokens = stop;
    separator(nest, tokens);
    /* Breakpoint here to examine and manipulate memory */
    *stop = terminator;
    return first;
}
/* One table-driven check for token(). */
struct test_case {
char *input; /* text handed to token() */
int nest; /* nesting counter, passed to token() by address */
char *expected_output; /* string the returned token is compared against */
};
/*
   Test driver.  TESTSEP is set to 0 just below, so the separator() branch is
   skipped and the table of token() cases is run.
*/
int main() {
int nest = 0;
int TESTSEP = 0;
if (TESTSEP>0) {
char *tokens = "# this is a comment\n{nesting {example} unnesting}\n \t end";
separator(&nest, &tokens);
printf("nest: %d\n", nest);
printf("tokens: %s\n", tokens);
return 0;
} else {
/* Every input below is a string literal. */
struct test_case test_cases[] = {
{"hello world", 0, "hello"},
{"hello#world", 0, "hello"},
{"hello{world}", 0, "hello"},
{"hello world", 0, "hello"},
{"hello\tworld", 0, "hello"},
{"hello\nworld", 0, "hello"},
};
for (int i = 0; i < sizeof(test_cases) / sizeof(test_cases[0]); i++) {
struct test_case test_case = test_cases[i];
/* tokens points straight at the literal; token() stores a '\0' through it, */
/* and C does not permit modifying a string literal. */
char *tokens = test_case.input;
char *output = token(&test_case.nest, &tokens);
if (strcmp(output, test_case.expected_output) != 0) {
printf("Test case %d failed: expected %s, got %s\n", i, test_case.expected_output, output);
}
}
return 0;
}
}
In the token function there is a comment line where I place a breakpoint and drop into the gdb debugger. The code is supposed to write a '\0' at the address of the pointer *s to truncate the string.
When I'm in the debugger and I examine the 's' variable I get the following:
(gdb) x s
0x100403055: 0x726f7720
When I try and set the variable I get:
(gdb) [![set *0x0000000100403055 = 0x726f7700][1]][1]
Cannot access memory at address 0x100403055
I'm using the CLION IDE and am a novice. I'm not sure if its an IDE problem, a user problem or some external memory protection mechanism that is preventing this.
Does anyone know how to make this work?
Here is a screenshot of the IDE:
When I run the code (without the debugger) I get this output:
./explore.exe
Test case 0 failed: expected hello, got hello world
Test case 1 failed: expected hello, got hello#world
Test case 2 failed: expected hello, got hello{world}
Test case 3 failed: expected hello, got hello world
Test case 4 failed: expected hello, got hello world
Test case 5 failed: expected hello, got hello world
Process finished with exit code 0
In this case I believe I was passing in a pointer to memory in read-only space. The strings in the test_case structs are string literals built into the program and are read-only, so when I passed them into the token function it was trying to write to read-only memory.
Here is the code that seems to work.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/*
   separator - advance *tokens past everything that is not part of a token:
       '#' up to and including the next '\n'   (comment)
       '{'                                      (increments *nest)
       '}'                                      (decrements *nest)
       ' ', '\t', '\n'                          (whitespace)
   On return *tokens points at the first character of the next token, or at
   the terminating NUL.
*/
static void separator(int *nest, char **tokens) {
    char *cursor = *tokens;
    int scanning = 1;

    while (scanning) {
        switch (*cursor) {
        case '#':
            /* Comment: drop everything up to and including the newline. */
            cursor++;
            while (*cursor != '\0') {
                if (*cursor++ == '\n')
                    break;
            }
            break;
        case '{':
            ++*nest;
            cursor++;
            break;
        case '}':
            --*nest;
            cursor++;
            break;
        case ' ':
        case '\n':
        case '\t':
            cursor++;
            break;
        default:
            /* NUL terminator or the first character of the next token. */
            scanning = 0;
            break;
        }
    }
    *tokens = cursor;
}
/*
   token - return the token that starts at *tokens.
   The token ends at the first '#', ' ', '\t', '\n', '{', '}' or NUL.  The
   separator run that follows is consumed (updating *nest) so that *tokens
   ends up at the start of the next token, and the first character after the
   token is overwritten with NUL.  The buffer must therefore be writable.
*/
static char *token(int *nest, char **tokens) {
    const char terminator = '\0';
    char *first = *tokens;
    /* strcspn yields the length of the leading run free of delimiters. */
    char *stop = first + strcspn(first, "# \t\n{}");

    *tokens = stop;
    separator(nest, tokens);
    *stop = terminator;
    return first;
}
/* One table-driven check for token(). */
struct test_case {
char *input; /* source text; copied into a heap buffer before token() sees it */
int nest; /* nesting counter, passed to token() by address */
char *expected_output; /* string the returned token is compared against */
};
/*
   Test driver.  token() writes into the text it scans, so every input is
   copied into a heap buffer first.  With TESTSEP > 0 a single
   token()/separator() round trip is printed; otherwise the table of token()
   cases is run.  Returns 1 if a buffer cannot be allocated, 0 otherwise.
*/
int main() {
    int nest = 0;
    int TESTSEP = 0;
    char *temp_malloc_string;
    if (TESTSEP>0) {
        const char *tokens = "# this is a comment\n{nesting {example} unnesting}\n \t end";
        /* +1 for the terminating NUL that strcpy copies as well. */
        char *owned = malloc(strlen(tokens) + 1);
        if (owned == NULL) {
            printf("error!\n");
            return 1;
        }
        strcpy(owned, tokens);
        /* token() and separator() advance temp_malloc_string; owned keeps the
           address that has to be freed. */
        temp_malloc_string = owned;
        char * t = token(&nest, &temp_malloc_string);
        printf("nest: %d\n", nest);
        printf("tokens: %s\n", t);
        separator(&nest, &temp_malloc_string);
        printf("nest: %d\n", nest);
        printf("tokens: %s\n", temp_malloc_string);
        free(owned);
        return 0;
    } else {
        struct test_case test_cases[] = {
            {"hello world", 0, "hello"},
            {"hello#world", 0, "hello"},
            {"hello{world}", 0, "hello"},
            {"hello world", 0, "hello"},
            {"hello\tworld", 0, "hello"},
            {"hello\nworld", 0, "hello"},
        };
        for (size_t i = 0; i < sizeof(test_cases) / sizeof(test_cases[0]); i++) {
            struct test_case test_case = test_cases[i];
            char *tokens = test_case.input;
            printf("len of string is %zu\n", strlen(tokens));
            temp_malloc_string = malloc(strlen(tokens) + 1);
            /* token() moves temp_malloc_string forward; tt keeps the address to free. */
            char * tt = temp_malloc_string;
            if ( temp_malloc_string==NULL ) {
                printf("error!\n");
                return 1;
            }
            strcpy(temp_malloc_string, tokens);
            printf("tm going in: %s\n", temp_malloc_string);
            char *output = token(&test_case.nest, &temp_malloc_string);
            printf("Test case %zu: expected %s, got %s\n\t\ttm is now: %s\n",
                   i, test_case.expected_output, output, temp_malloc_string);
            if (strcmp(output, test_case.expected_output) != 0) {
                printf("Test case %zu failed: expected %s, got %s\n",
                       i, test_case.expected_output, output);
            }
            free(tt);
            temp_malloc_string = NULL;
        }
        return 0;
    }
}
Now when I run the code I get:
./explore.exe
len of string is 11
tm going in: hello world
Test case 0: expected hello, got hello
tm is now: world
len of string is 11
tm going in: hello#world
Test case 1: expected hello, got hello
tm is now:
len of string is 12
tm going in: hello{world}
Test case 2: expected hello, got hello
tm is now: world}
len of string is 12
tm going in: hello world
Test case 3: expected hello, got hello
tm is now: world
len of string is 11
tm going in: hello world
Test case 4: expected hello, got hello
tm is now: world
len of string is 11
tm going in: hello
world
Test case 5: expected hello, got hello
tm is now: world
Process finished with exit code 0
And when I stop at the breakpoint I can write to memory.
In this modified code I malloc a char* object and copy the string from the struct into that then pass that into the token function.
I'm guessing that gdb was protecting me from writing to a read-only section of the program image.
Like I said: I'm a newbie :(
So, I'm working on this project for Comp 272, Data Structures and Algorithms, and before anyone asks I have no one to help me. It's an online program through Athabasca University and for some unknown reason they didn't supply me with a tutor for this course, which is a first... So... Yeah. The question is as follows:
"(20 marks) Exercise 8.2. Illustrate what happens when the sequence 1, 5, 2, 4, 3 is added to an empty ScapegoatTree, and show where the credits described in the proof of Lemma 8.3 go, and how they are used during this sequence of additions."
This is my code, its complete and it compiles:
/*
Name: Westcott.
Assignment: 2, Question 3.
Date: 08-26-2022.
"(20 marks) Exercise 8.2. Illustrate what happens when the sequence 1, 5, 2, 4, 3 is added to an empty
ScapegoatTree, and show where the credits described in the proof of Lemma 8.3 go, and how they are used
during this sequence of additions."
*/
#include <iostream>
using namespace std;
// A single tree node: an int key plus links to both children and the parent.
// All links start out NULL; the destructor does not touch the children.
class Node {
public:
    int data;      // key stored in this node
    Node* left;    // left child, NULL when absent
    Node* right;   // right child, NULL when absent
    Node* parent;  // parent node, NULL when not linked

    Node() : data(0), left(NULL), right(NULL), parent(NULL) {}
    Node(int x) : data(x), left(NULL), right(NULL), parent(NULL) {}
    ~Node() {}

    // Search the subtree rooted at `root` for x; defined outside the class.
    Node* binarySearch(Node* root, int x);
};
class sgTree { // The sgTree keeps track of the root, n (the number of nodes in the tree), and q which is
public: // as Pat put it a 'high water mark'.
Node* root;
int n;
int q;
sgTree() : root(new Node()), n(1), q(1) {}
sgTree(int x) : root(new Node(x)), n(0), q(0) {}
~sgTree() {
delete root;
}
bool add(int x); // The add function is compounded, within it are findDepth and rebuild.
bool removeX(int x); // removeX works, but it didn't have a big part to play in this question,
int findDepth(Node* addedNode); // but I'll include it to maintain our sorted set interface.
void printTree(Node* u, int space) { // This was extra function I wrote to help me problem solve.
cout << "BINARY TREE DISPLAY" << endl; // this version only prints a title and then it calls printTreeSub on line 46.
cout << "________________________________________________\n\n" << endl;
printTreeSub(u, space);
cout << "________________________________________________\n\n" << endl;
}
int printTreeSub(Node* u, int space); // Function definition for this is on line 81.
int storeInArray(Node* ptr, Node* arr[], int i);// this is our function for storing all the elements of a tree in an array.
int size(Node* u); // this is size, defined on line 74.
void rebuild(Node* u); // And rebuild and buildBalanced are the stars of the show, defined on lines 262 and 282
Node* buildBalanced(Node** a, int i, int ns); // just above the main() funciton.
};
// Integer part of the logarithm of q to base 3/2.
// Returns the largest c with (3/2)^c <= q, and 0 for q <= 1.  For an integer
// depth d, "d > log_{3/2}(q)" is the same test as "d > log32(q)".
// The previous loop halved q and so returned floor(log2(q)) + 1 instead.
int log32(int q) {
    int exponent = 0;
    // power holds (3/2)^(exponent + 1).
    double power = 1.5;
    while (power <= q) {
        power *= 1.5;
        ++exponent;
    }
    return exponent;
}
/*
static int const log32(int n)
{
double const log23 = 2.4663034623764317;
return (int)ceil(log23 * log(n));
}
*/
// Number of nodes in the subtree rooted at u (0 for NULL), counted recursively.
int sgTree::size(Node* u) {
    return (u == NULL) ? 0 : size(u->left) + size(u->right) + 1;
}
// Recursive helper for printTree: prints the left subtree, then u, then the
// right subtree.  `space` controls how many blanks precede an inner node.
// Returns `space` unchanged when u is NULL and 1 for every real node.
int sgTree::printTreeSub(Node* u, int space) {
if (u == NULL) return space;
space--;
// The callee's return value is subtracted from this level's indentation.
space -= printTreeSub(u->left, space);
if (u->right == NULL && u->left == NULL) {
// Leaf: only the key is printed, padded with newlines.
cout << "\n\n\n" << u->data << "\n\n\n" << endl;
return 1;
}
// Inner node: one blank per loop pass (none when space is negative).
for (int i = space; i >= 0; i--) {
cout << " ";
}
cout << " " << u->data << "'s children are: ";
if (u->left != NULL) {
cout << u->left->data;
}
else {
cout << "NULL";
}
if (u->right != NULL) {
cout << " and " << u->right->data;
}
else {
cout << " NULL";
}
cout << "\n\n" << endl;
space--;
// The result is stored in the local `space`, which is not read again.
space -= printTreeSub(u->right, space);
return 1;
}
// In-order traversal of the subtree rooted at ptr, storing each node pointer
// in a[] starting at index i.  Returns the index one past the last slot
// written (i itself when ptr is NULL).
int sgTree::storeInArray(Node* ptr, Node* a[], int i) {
    if (ptr != NULL) {
        const int slot = storeInArray(ptr->left, a, i);
        a[slot] = ptr;
        return storeInArray(ptr->right, a, slot + 1);
    }
    return i;
}
// Search the subtree rooted at `root` for x.
// Returns the node holding x when present; otherwise the last node visited,
// i.e. the node under which x would be inserted.  Returns NULL only when
// `root` itself is NULL.  Iterative, so every path ends in a return.
Node* Node::binarySearch(Node* root, int x) {
    if (root == NULL) {
        return NULL;
    }
    Node* current = root;
    for (;;) {
        if (x == current->data) {
            return current;
        }
        Node* next = (x < current->data) ? current->left : current->right;
        if (next == NULL) {
            return current;
        }
        current = next;
    }
}
// Insert x into the tree.
// Returns false and leaves the tree untouched when x is already present;
// otherwise links a new leaf, counts it in n and q, rebalances if the leaf
// ended up deeper than log32(q), and returns true.
bool sgTree::add(int x) {
    if (root == NULL) {
        // Empty tree: the new node becomes the root.
        root = new Node(x);
        n++;
        q++;
        return true;
    }

    // binarySearch returns either the node holding x or the would-be parent.
    Node* parent = root->binarySearch(root, x);
    if (parent->data == x) {
        return false;  // duplicate: nothing is allocated or counted
    }

    Node* addedNode = new Node(x);
    addedNode->parent = parent;
    if (x < parent->data) {
        parent->left = addedNode;
    }
    else {
        parent->right = addedNode;
    }
    // Count the new node before the depth test so q includes it.
    n++;
    q++;

    const int h = findDepth(addedNode);
    if (h > log32(q)) {
        // Walk up while the child holds at most 2/3 of its parent's nodes
        // (3*size(w) <= 2*size(w->parent) is that ratio cross-multiplied).
        // The walk stops at the root at the latest, so w->parent is never
        // dereferenced when NULL.
        Node* w = addedNode->parent;
        while (w->parent != NULL && 3 * size(w) <= 2 * size(w->parent)) {
            w = w->parent;
        }
        // The scapegoat is w's parent; if the walk reached the root,
        // rebuild the whole tree instead.
        rebuild(w->parent != NULL ? w->parent : w);
    }
    return true;
}
// Number of parent links between addedNode and root (0 when addedNode is root).
int sgTree::findDepth(Node* addedNode) {
    int depth = 0;
    for (Node* cursor = addedNode; cursor != root; cursor = cursor->parent) {
        ++depth;
    }
    return depth;
}
bool sgTree::removeX(int x) {
Node* u = root->binarySearch(root, x);
if (u->left == NULL && u->right == NULL) {
if (u == u->parent->left) {
u->parent->left = NULL;
}
if (u == u->parent->right) {
u->parent->right = NULL;
}
cout << u->data << " deleted" << endl;
n--;
delete u;
return true;
}
if (u->left != NULL && u->right == NULL) {
if (u->parent->left = u) {
u->parent->left = u->left;
}
else if (u->parent->right = u) {
u->parent->right = u->left;
}
cout << u->data << " deleted" << endl;
n--;
delete u;
return true;
}
if (u->left == NULL && u->right != NULL) {
if (u == u->parent->left) {
u->parent->left = u->right;
u->right->parent = u->parent;
}
else if (u == u->parent->right) {
u->parent->right = u->right;
u->right->parent = u->parent;
}
cout << u->data << " deleted" << endl;
n--;
delete u;
return true;
}
if (u->left != NULL && u->right != NULL) {
Node* X = u->right;
if (X->left == NULL) {
X->left = u->left;
if (u->parent != NULL) {
if (u->parent->right == u) {
u->parent->right == X;
}
else if (u->parent->left == u) {
u->parent->left = X;
}
}
else {
root = X;
}
X->parent = u->parent;
cout << u->data << " deleted" << endl;
n--;
delete u;
return true;
}
while (X->left != NULL) {
X = X->left;
}
X->parent->left = NULL;
X->left = u->left;
X->right = u->right;
if (u->parent != NULL) {
X->parent = u->parent;
}
cout << u->data << " deleted" << endl;
n--;
root = X;
delete u;
return true;
}
}
// Rebuild the subtree rooted at u into a balanced subtree and hook the result
// back in where u used to hang (or make it the root when u had no parent).
// Does nothing for u == NULL.
void sgTree::rebuild(Node* u) {
    if (u == NULL) {
        return;
    }
    const int ns = size(u);  // number of nodes in the subtree, not in the whole tree
    Node* p = u->parent;

    // Collect the subtree's nodes in sorted order, relink them, and release
    // the scratch array; it is only needed while building.
    Node** a = new Node*[ns];
    storeInArray(u, a, 0);
    Node* top = buildBalanced(a, 0, ns);
    delete[] a;

    // buildBalanced only rewires nodes inside the subtree, so p still points
    // at u on whichever side u was attached.
    top->parent = p;
    if (p == NULL) {
        root = top;
    }
    else if (p->right == u) {
        p->right = top;
    }
    else {
        p->left = top;
    }
}
// Link the ns nodes a[i] .. a[i+ns-1] into a subtree whose top is the middle
// element a[i + ns/2]; the elements before it form the left subtree and the
// elements after it the right subtree.  Child parent links are set here; the
// parent link of the returned node is left for the caller.
// Returns NULL when ns == 0.
Node* sgTree::buildBalanced(Node** a, int i, int ns) {
    if (ns == 0) {
        return NULL;
    }
    const int m = ns / 2;
    Node* mid = a[i + m];

    mid->left = buildBalanced(a, i, m);
    if (mid->left != NULL) {
        mid->left->parent = mid;
    }

    mid->right = buildBalanced(a, i + m + 1, ns - m - 1);
    if (mid->right != NULL) {
        mid->right->parent = mid;
    }
    return mid;
}
// Build a tree that starts with 1, add 5, 2, 4, 3 in that order, and print it.
int main() {
    sgTree newTree(1);
    const int a[] = { 5, 2, 4, 3 };
    const int count = sizeof(a) / sizeof(a[0]);
    for (int i = 0; i < count; ++i) {
        newTree.add(a[i]);
    }
    newTree.printTree(newTree.root, newTree.n);
    /*
    Optional second test, most useful together with rebuild(root):
    sgTree newTreeB(1);
    int b[] = { 2, 3, 4, 5, 6, 7, 8, 9, 10 };
    for (int i = 0; i < (sizeof(b) / sizeof(b[0])); i++) {
    newTreeB.add(b[i]);
    }
    newTreeB.printTree(newTreeB.root, newTreeB.n);
    */
}
Now the issue itself is not that hard to understand. My tree should look like this:
But instead, it looks like this, with 5 at the root and the values 1 and 4 as the leaves:
I'm confident that the problem lives somewhere around line 159 and in those first few calls to buildBalanced. The comments in the code itself elaborate more on the issue. I've spent days just poring over this, trying everything I can think of to make it work and... Yeah... I just can't figure it out.
I have a very long string (about 50,000,000 characters), and I am substituting it part by part:
# Question pseudo-code: CONDITION, "x changed sth" and sth are placeholders.
cat FILE | tail -n+2 | awk -v k=100 '{
i = 1
while (i<length($0)-k+1) {
# x is the k-character window that starts at position i
x = substr($0, i, k)
if (CONDITION) {
x changed sth
# the whole record is rebuilt around the modified window on every hit
$0 = substr($0,1,i-1) x substr($0,i+k)
}
i += 1
}
gsub(sth,sth,$0)
printf("%s",$0) >> FILE
}'
Are there any ways to replace $0 at position i with x of length k without using this method?
The string is too long and the commands runs extremely slow
sample input:
NNNNNNNNNNggcaaacagaatccagcagcacatcaaaaagcttatccacAGTAATTCATTATATCAAAATGCTCCAggccaggcgtggtggcttatgcc
sample output:
NNNNNNNNNNggcnnncngnnnccngcngcncnncnnnnngcnnnnccncNGNNNNNCNNNNNNNCNNNNNGCNCCNggccnggcgnggnggcnnnngcc
If substring with length k=10 contains >50% of A || a || T || t
(so there are length($0)-k+1 substrings)
substitute A and T with N, a and t with n
The $0 string must maintain it size and sequence (Case sensitive)
EDIT:
I misunderstood the requirement of this problem, and repost the question at here.
Basically:
read a window of characters to two buffers - scratch buffer and output buffer
if the scratch buffer contains more than some count of the characters ATat
then replace all characters ATat in the output buffer with Nn respectively
output one character from the output buffer
flush one character in both buffers
and go to step 1 to repeat reading the characters into buffers
when the end of line is encountered, just flush output buffer and reset it all
A small C program for sure is going to be the fastest:
// The window size
#define N 10
// The percent of the window that has to be equal to one of [AaTt]
#define PERCENT 50
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
// Write the first n bytes of outme to standard output.
static void output(char *outme, size_t n) {
    fwrite(outme, sizeof *outme, n, stdout);
}
// True exactly for the four characters 'A', 'a', 'T' and 't'.
static bool is_one_of_them(char c) {
    return c == 'A' || c == 'a' || c == 'T' || c == 't';
}
// Map a letter to 'N' or 'n', keeping its case.
// ASCII is assumed: the upper three bits (which carry the case) are kept and
// the lower five bits are replaced by those of 'n'.
static char convert_them_to_n(char c) {
    const int case_bits = c & ~0x1f;
    const int letter_bits = 'n' & 0x1f;
    return case_bits | letter_bits;
}
static const unsigned threshold = N * PERCENT / 100;
// Store the input in buf
static char buf[N];
// Store the output to-be-outputted in out
static char out[N];
// The current position in buf and out
// The count of readed characters
static size_t pos;
// The count of one of searched characters in buf
static unsigned count_them;
// Empty the window: no buffered characters and no counted A/a/T/t.
static void buf_reset(void) {
    count_them = 0;
    pos = 0;
}
// Emit everything still held in the output buffer, then start over empty.
static void buf_flush(void) {
    const size_t pending = pos;
    output(out, pending);
    buf_reset();
}
// Rewrite every A/a/T/t in all N slots of the output buffer to N/n.
static void buf_replace_them(void) {
    // TODO: this could keep count of characters already replaced in out to save CPU
    for (char *slot = out; slot != out + N; ++slot) {
        if (is_one_of_them(*slot)) {
            *slot = convert_them_to_n(*slot);
        }
    }
}
// Emit the oldest output character and slide both buffers left by one slot.
// Only valid on a full window (pos == N).
static void buf_flush_one(void) {
    assert(pos > 0);
    assert(pos == N);
    output(out, 1);
    // The character leaving the window no longer counts towards the total.
    if (is_one_of_them(buf[0])) {
        count_them--;
    }
    const size_t remaining = pos - 1;
    memmove(buf, &buf[1], remaining);
    memmove(out, &out[1], remaining);
    pos = remaining;
}
// Append c to both buffers.  When that fills the window, replace A/a/T/t in
// the output buffer if the window holds at least `threshold` of them, then
// release the oldest character.
static void buf_add(char c) {
    buf[pos] = c;
    out[pos] = c;
    ++pos;
    if (is_one_of_them(c)) {
        ++count_them;
    }
    // Nothing more to do until the window is full.
    if (pos != N) {
        return;
    }
    if (count_them >= threshold) {
        buf_replace_them();
    }
    // Flush one character only at a time.
    buf_flush_one();
}
// Filter stdin to stdout character by character; each input line is
// processed independently.
int main() {
    buf_reset();
    for (int c = getchar(); c != EOF; c = getchar()) {
        if (c != '\n') {
            buf_add(c);
            continue;
        }
        // End of line: flush what is buffered, then emit the newline itself.
        buf_flush();
        output("\n", 1);
    }
    // Input may end without a final newline.
    buf_flush();
}
Such a C program is easily transferable to for example an awk script, just one need to read one character at a time. Below I split the characters with split, like:
awk -v N=10 -v percent=50 '
# Sliding-window filter: characters are fed one at a time into two parallel
# windows of N slots, buf (original text) and out (text to print).  Whenever
# the window is full and holds at least threshold characters from [aAtT],
# those characters are rewritten to N/n in out; then the oldest character is
# printed and both windows slide by one.
BEGIN{ threshold = N * percent / 100; pos=0 }
function is_one_of_them(c) {
    return c ~ /^[aAtT]$/;
}
# Print whatever is still buffered and reset the window.
function buf_flush(i) {
    for (i = 0; i < pos; ++i) {
        printf "%s", out[i]
    }
    pos = 0
    count_them = 0
}
# Rewrite A/T to N and a/t to n in every buffered output slot.
function buf_replace_them(i) {
    for (i = 0; i < pos; ++i) {
        if (is_one_of_them(out[i])) {
            out[i] = out[i] ~ /[AT]/ ? "N" : "n";
        }
    }
}
# Print the oldest output character and shift both windows left by one.
function buf_flush_one(i) {
    printf "%s", out[0]
    count_them -= is_one_of_them(buf[0])
    if(0 && debug) {
        printf(" count_them %s ", count_them)
        for (i = 0; i < pos-1; ++i) {
            printf("%s", buf[i+1])
        } printf(" ");
        for (i = 0; i < pos-1; ++i) {
            printf("%s", out[i+1])
        }
        printf("\n");
    }
    for (i = 0; i < pos-1; ++i) {
        buf[i] = buf[i+1]
        out[i] = out[i+1]
    }
    pos--
}
# Append one character; act on the window once it holds N characters.
function buf_add(c) {
    buf[pos]=c; out[pos]=c; pos++
    count_them += is_one_of_them(c)
    if (pos == N) {
        if (count_them >= threshold) {
            buf_replace_them()
        }
        buf_flush_one()
    }
}
{
    # split() numbers its elements from 1, so the loop starts at 1; index 0
    # is never set and would feed an empty string into the window.
    # NOTE(review): splitting on "" into single characters is an extension
    # (gawk and others), not POSIX - confirm for the awk in use.
    split($0, chars, "")
    for (idx = 1; idx <= length($0); idx++) {
        buf_add(chars[idx])
    }
    buf_flush();
    printf "\n";
}
'
Both programs, when run with the input presented in the first line, produce the output presented in the second line (note that the lone a near the end is not replaced, because no window of 10 characters containing it holds 5 of the characters ATat):
NNNNNNNNNNggcaaacagaatccagcagcacatcaaaaagcttatccacAGTAATTCATTATATCAAAATGCTCCAggccaggcgtggtggcttatgcc
NNNNNNNNNNggcnnncngnnnccngcngcncnncnnnnngcnnnnccncNGNNNNNCNNNNNNNCNNNNNGCNCCNggccaggcgnggnggcnnnngcc
Both solutions were tested on repl.
You need to be careful with how you address this problem. You cannot work on the substituted string. You need to keep track of the original string. Here is a simple example. Assume we have a string consisting of x and y and we want to replace all y with z if there are 8 y in a substring of 10. Imagine your input looks like:
yyyyyyyyxxy
The first substring of 10 reads yyyyyyyyxx and would be translated into zzzzzzzzxx. If you perform the substitution directly into the original string, you get zzzzzzzzxxy. The second substring now reads zzzzzzzxxy, and does not contain 8 times y, while in the original string it does. So according to the solution of the OP, this would lead into inconsistent results, depending on if you start from the front or the back. So a quick solution would be:
awk -v N=10 -v p=50 '
# n is the number of A/T/a/t characters a window of N must hold to qualify.
BEGIN { n = N*p/100 }
# s collects the result; $0 itself is never modified, so every window is
# judged against the original text.
{ s = $0 }
# A record of length L has L-N+1 windows, starting at 1 .. L-N+1.
{ for(i=1;i<=length($0)-N+1;++i) {
    str=substr($0,i,N)
    # gsub returns the number of replacements, so c counts A/T/a/t in the window
    c=gsub(/[AT]/,"N",str) + gsub(/[at]/,"n",str)
    if(c >= n) s = substr(s,1,i-1) str substr(s,i+N)
  }
}
{ print s }' file
There is, of course, quite some work that is done twice here. Imagine you have a string of the form xxyyyyyyyyxx: you would perform 4 concatenations while you only need to do one. So the best idea is to minimise the work and only check the substrings which end with the respective character:
awk -v N=10 -v p=50 '
# n is the number of A/T/a/t characters a window of N must hold to qualify.
BEGIN { n = N*p/100 }
# s collects the result; $0 itself is never modified.
{ s = $0 }
# Only windows that END on an A/T/a/t character are examined below.  That
# covers every qualifying window except one case: a window whose last A/T/a/t
# lies before position N can only be reached through the first window, so
# that window is always checked.
length($0) >= N {
    str=substr($0,1,N)
    c=gsub(/[AT]/,"N",str) + gsub(/[at]/,"n",str)
    if(c >= n) { s = str substr(s,N+1) }
}
# i is the first position not yet searched; i+RSTART-1 is the absolute
# position of the next A/T/a/t, and the window ending there starts at
# i+RSTART-N.
{ i=N+1; while (match(substr($0,i),/[ATat]/)) {
    str=substr($0,i+RSTART-N,N)
    c=gsub(/[AT]/,"N",str) + gsub(/[at]/,"n",str)
    if(c >= n) { s = substr(s,1,i+RSTART-N-1) str substr(s,i+RSTART)}
    i=i+RSTART
  }
}
{ print s }' file
To replace $0 at position i with x do:
awk 'BEGIN{i=12345;x="blubber"}
# Overwrite length(x) characters of $0 starting at position i with x.
{
    # characters 1 .. i-1 are kept; position i itself is where x begins
    printf("%s",substr($0,1,i-1));
    printf("%s",x);
    # resume right after the overwritten stretch
    printf("%s",substr($0,i+length(x)));
}'
I don't think there is any faster method.
To replace AGCT with N and agct with n use tr. To replace them only within a range and using awk you should do:
awk 'BEGIN{i=12345;n=123}
# Replace ATGC with N and atgc with n, but only in the n characters of $0
# that start at position i.
{
    printf("%s",substr($0,1,i-1));
    # gsub edits a variable in place and returns a count, so the range is
    # copied into seg first; the third argument of substr is a length.
    seg = substr($0,i,n)
    gsub(/[ATGC]/,"N",seg)
    gsub(/[atgc]/,"n",seg)
    printf("%s",seg);
    printf("%s",substr($0,i+n));
}'
To do more advanced and faster processing you should consider c/c++.
I just saw that this could technically work. The only problem I couldn't resolve is an extra character that gets printed at the end every time I test it. I also tested this without using the name variable: simply subtracting 32 from any lower-case ASCII letter should give me its upper-case counterpart, and it does. But I'm curious why I'm getting an additional char, which from what I see on screen is apparently Û.
#include <stdio.h>
/* Read a name from stdin and print every character with 32 subtracted. */
main()
{
char name[22];
int i;
fputs("Type your name ",stdout);
/* fgets stores the newline that ends the input line in name when it fits. */
fgets(name,22,stdin);
/* The loop runs up to the terminating NUL, so every stored character, */
/* letter or not, has 32 subtracted before it is printed. */
for (i = 0; name[i] != '\0'; i = i + 1)
printf("%c",(name[i])-32); /* lower-case ASCII letters come out upper-case */
fflush(stdin);
getchar();
}
Perhaps there is a line break character at the end of the string.
You can check the chararacter code, so that you only convert characters that actually are lower case letters:
/* Print name with its lower-case ASCII letters converted to upper case. */
for (i = 0; name[i] != '\0'; i = i + 1) {
    char c = name[i];
    /* Only 'a'..'z' (97..122) are shifted; everything else, including the
       newline fgets may have stored, is printed unchanged. */
    if (c >= 'a' && c <= 'z') {
        c -= 32;
    }
    printf("%c", c);
}
// Read one line from stdin and store its letters, lower-cased, in in_array.
// Everything that is not an ASCII letter (digits, punctuation, whitespace) is
// dropped, so the stored letters are contiguous.  Reading stops at '\n' or at
// end of input.  *Length receives the number of letters stored; no NUL
// terminator is written.
// At most MAX_LETTERS letters are stored (the size of the scratch buffer the
// previous version used); further input on the line is read and discarded.
// NOTE(review): assumes in_array has room for MAX_LETTERS chars - confirm
// against the callers.
void read_chararray(char in_array[], int* Length)
{
    enum { MAX_LETTERS = 255 };
    int count = 0;    // letters stored so far
    int cinput;       // int, not char, so EOF can be told apart from data

    while ((cinput = getchar()) != '\n' && cinput != EOF) {
        if (cinput >= 'A' && cinput <= 'Z') {
            cinput += 'a' - 'A';    // shift upper case into the lower-case range
        }
        if (cinput >= 'a' && cinput <= 'z' && count < MAX_LETTERS) {
            in_array[count++] = (char)cinput;
        }
    }
    *Length = count;
}
#include<stdio.h>
void upper(char);
/* Read a single character from stdin and hand it to upper() for printing.
   Returns 0 on success, 1 if no character could be read. */
int main(void)
{
    char ch;

    printf("\nEnter the character in lower case");
    /* scanf returns the number of items converted; anything but 1 means
       ch was not filled in. */
    if (scanf("%c", &ch) != 1)
        return 1;
    upper(ch);
    return 0;
}
/* Print "\nUpper Case: " followed by c converted to upper case.
   Only ASCII lower-case letters are shifted; any other character is
   printed exactly as it was passed in. */
void upper(char c)
{
    if (c >= 'a' && c <= 'z')
        c = (char)(c - ('a' - 'A'));
    printf("\nUpper Case: %c", c);
}
I have this string:
12 4 the quick 99 -1 fox dog \
What I want in my program:
myArray[] = {12, 4, 99, -1};
How do I scan multiple numbers out of it?
See my answer to your other question here. It's a relatively simple matter to replace the strtok section to recognize non-numeric words and neither increment the count (in the first pass) nor load them into the array (in the second pass).
The code has changed as follows:
Using an input file of:
12 3 45 6 7 8
3 5 6 7
7 0 -1 4 5
12 4 the quick 99 -1 fox dog \
it produces output along the lines of:
0x8e42170, size = 6:
12 3 45 6 7 8
0x8e421d0, size = 4:
3 5 6 7
0x8e421e0, size = 5:
7 0 -1 4 5
0x8e42278, size = 4:
12 4 99 -1
Here's the code that produced that output:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
// This is the linked list of integer arrays.
// addNode appends one node for each string it is given.
typedef struct _tIntArray {
int size; // number of entries in array.
int *array; // the integers, allocated with malloc in addNode.
struct _tIntArray *next; // following node; NULL for the newest node.
} tIntArray;
// Head and tail of the list; both are NULL until the first node is added.
static tIntArray *first = NULL;
static tIntArray *last = NULL;
// Check that argument is numeric: an optional minus sign followed by
// one or more digits.
// Returns 1 if word has that form, 0 otherwise. An empty string and a
// lone "-" are rejected, so such tokens are never turned into numbers.
static int isAllNumeric (const char *word) {
    const char *s = word;

    if (*s == '-')
        s++;
    // Require at least one digit after the optional sign.
    if (*s == '\0')
        return 0;
    for (; *s != '\0'; s++)
        if ((*s < '0') || (*s > '9'))
            return 0;
    return 1;
}
// Add a line of integers as a node.
//
// str is split on spaces (strtok modifies it in place). Every token that
// isAllNumeric accepts is converted with atoi and stored; all other tokens
// are skipped. The new node is appended to the first/last list.
// A line without any numeric token yields a node with size 0 and a NULL array.
// Returns 0 on success, 1 if memory could not be allocated.
static int addNode (char *str) {
    tIntArray *curr;    // pointers for new integer array.
    char *word;         // word within string.
    char *tmpStr;       // temp copy of buffer.
    int fldCnt;         // field count for line.
    int i;

    // Count number of fields. This pass works on a copy because strtok
    // overwrites the separators and str is needed intact for the second pass.
    if ((tmpStr = strdup (str)) == NULL) {
        printf ("Cannot allocate duplicate string (%d).\n", errno);
        return 1;
    }
    fldCnt = 0;
    for (word = strtok (tmpStr, " "); word; word = strtok (NULL, " "))
        if (isAllNumeric (word))
            fldCnt++;
    free (tmpStr);

    // Create new linked list node.
    if ((curr = malloc (sizeof (tIntArray))) == NULL) {
        printf ("Cannot allocate integer array node (%d).\n", errno);
        return 1;
    }
    curr->size = fldCnt;
    curr->array = NULL;
    curr->next = NULL;

    // malloc(0) is allowed to return NULL, which must not be mistaken for an
    // allocation failure, so only allocate when there is something to store.
    if (fldCnt > 0) {
        if ((curr->array = malloc (fldCnt * sizeof (int))) == NULL) {
            printf ("Cannot allocate integer array (%d).\n", errno);
            free (curr);
            return 1;
        }
    }

    // Second pass: load the numeric fields into the array.
    for (i = 0, word = strtok (str, " "); word; word = strtok (NULL, " "))
        if (isAllNumeric (word))
            curr->array[i++] = atoi (word);

    // Append the node to the list.
    if (last == NULL)
        first = last = curr;
    else {
        last->next = curr;
        last = curr;
    }
    return 0;
}
// Read qq.in line by line, pass every line to addNode, print the resulting
// list and release all resources.
// Returns 0 on success, 1 on any file or allocation error.
int main(void) {
    int lineSz;         // current line size.
    char *buff;         // buffer to hold line.
    FILE *fin;          // input file handle.
    long offset;        // offset for re-allocating line buffer.
    tIntArray *curr;    // pointers for new integer array.
    tIntArray *next;    // successor saved while a node is being freed.
    size_t len;         // length of the text fgets stored in buff.
    int rc = 0;         // exit status.
    int i;

    // Open file.
    if ((fin = fopen ("qq.in", "r")) == NULL) {
        printf ("Cannot open qq.in, errno = %d\n", errno);
        return 1;
    }

    // Allocate initial line; the buffer grows on demand in the loop below.
    lineSz = 2;
    if ((buff = malloc (lineSz+1)) == NULL) {
        printf ("Cannot allocate initial memory, errno = %d.\n", errno);
        fclose (fin);
        return 1;
    }

    // Loop until end of file or the first error.
    while (1) {
        // Save offset in case we need to re-read.
        if ((offset = ftell (fin)) < 0) {
            printf ("Cannot get file offset, errno = %d.\n", errno);
            rc = 1;
            break;
        }

        // Get line, exit if end of file.
        if (fgets (buff, lineSz, fin) == NULL)
            break;
        len = strlen (buff);

        if ((len > 0) && (buff[len-1] == '\n')) {
            // Complete line: remove newline.
            buff[len-1] = '\0';
        } else if (len == (size_t) (lineSz - 1)) {
            // No newline and fgets used all the room it was given, so the
            // line may have been cut off: get bigger buffer, seek back to
            // line start and retry.
            free (buff);
            lineSz += 3;
            if ((buff = malloc (lineSz+1)) == NULL) {
                printf ("Cannot allocate extra memory, errno = %d.\n", errno);
                rc = 1;
                break;
            }
            if (fseek (fin, offset, SEEK_SET) != 0) {
                printf ("Cannot seek, errno = %d.\n", errno);
                rc = 1;
                break;
            }
            continue;
        }
        // Otherwise fgets stopped early without a newline. A bigger buffer
        // would not change that (typically the last line of a file that has
        // no trailing newline), so the text is processed as it is instead of
        // being re-read forever.

        if (addNode (buff) != 0) {
            rc = 1;
            break;
        }
    }

    // Dump table for debugging.
    if (rc == 0) {
        for (curr = first; curr != NULL; curr = curr->next) {
            // %p requires a void pointer argument.
            printf ("%p, size = %d:\n ", (void *) curr, curr->size);
            for (i = 0; i < curr->size; i++)
                printf (" %d", curr->array[i]);
            printf ("\n");
        }
    }

    // Free resources and exit. The list is not used again after this point.
    for (curr = first; curr != NULL; curr = next) {
        next = curr->next;
        free (curr->array);
        free (curr);
    }
    free (buff);    // free(NULL) is a no-op if the last reallocation failed.
    fclose (fin);
    return rc;
}