OCaml: new_line equivalent before 3.11 - compilation

I'm trying to compile the Libra toolkit on a machine running Ubuntu Hardy with OCaml 3.10. I can't upgrade the OS or update OCaml, and I don't know anything about OCaml. There is only one line that gives me an unbound value error, because it uses the new_line function, which was introduced in OCaml 3.11 (http://caml.inria.fr/pub/docs/manual-ocaml/libref/Lexing.html). Could someone tell me how to change it so it is compatible with OCaml 3.10? It's the line near the end of this code:
{
open MnParseTypes;;
open MnParser;;
(* Raised when parsing ends *)
exception Eof;;
module L = Lexing
let linenum lexbuf = lexbuf.L.lex_curr_p.L.pos_lnum
let line = ref 1;;
let keywords = Hashtbl.create 10
let _ =
List.iter2 (Hashtbl.add keywords)
["mn"; "features"; "tree"; "table"; "w"; "eof"]
[Tmn; Tfeatures; Ttree; Ttable; Tweight; EOF];;
}
let digits = ['0'-'9']+
let identifier = ['a'-'z' 'A'-'Z']+
rule lexer = parse
(* eat blank characters *)
[' ' '\t'] {lexer lexbuf}
(* | "Feature list:" {lexer lexbuf} *)
| '{' {Tlbrace}
| '}' {Trbrace}
| '(' {Tlparen}
| ')' {Trparen}
| ('-')? "inf" {Tfloat( float_of_string(L.lexeme lexbuf))}
| identifier {
let x = String.lowercase (Lexing.lexeme lexbuf) in
try Hashtbl.find keywords x
with Not_found ->
failwith((Lexing.lexeme lexbuf)
^ ": unknown identifier on line " ^ string_of_int (linenum lexbuf))}
| digits {Tint (int_of_string (L.lexeme lexbuf))}
| ('-')? digits ('.' digits)? (['e' 'E'] ['+' '-']? digits)?
{Tfloat( float_of_string(L.lexeme lexbuf))}
| '+' 'v' (digits as var) '_' (digits as value)
{Tcond(true, int_of_string var, int_of_string value)}
| '-' 'v' (digits as var) '_' (digits as value)
{Tcond(false, int_of_string var, int_of_string value)}
| 'v' (digits as var) '_' (digits as value)
{Tvar( int_of_string var, int_of_string value)}
| ['\n' '\r']+ {L.new_line lexbuf; TEOL} (* THIS GIVES THE ERROR *)
| eof {EOF}
| _ {failwith((L.lexeme lexbuf) ^
": mistake on line " ^ string_of_int lexbuf.L.lex_curr_p.L.pos_lnum)}

In the directory of the OCaml sources (from SVN or a release tarball), the source of the module Foo of the standard library will be in stdlib/foo.{ml,mli} (.mli is the interface file, .ml the implementation file). Looking at stdlib/lexing.ml gives you:
let new_line lexbuf =
  let lcp = lexbuf.lex_curr_p in
  lexbuf.lex_curr_p <- { lcp with
    pos_lnum = lcp.pos_lnum + 1;
    pos_bol = lcp.pos_cnum;
  }
You can implement this in your own code as well, either using open Lexing to bring the field names into scope, or by qualifying them explicitly: lexbuf.Lexing.lex_curr_p, and { lcp with Lexing.pos_lnum = lcp.Lexing.pos_lnum ... } instead.
Edit: as you probably don't plan to hack the OCaml code yourself, let's give you the full thing:
let new_line lexbuf =
  let lcp = lexbuf.Lexing.lex_curr_p in
  lexbuf.Lexing.lex_curr_p <- { lcp with
    Lexing.pos_lnum = lcp.Lexing.pos_lnum + 1;
    Lexing.pos_bol = lcp.Lexing.pos_cnum;
  }
Add this at the top of the file that uses new_line (if the call site says Lexing.new_line, change it to new_line), and you should be fine.
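For the lexer in the question, which abbreviates Lexing as L, here is a sketch of how it fits together, assuming the shim is pasted into the { ... } header block of the .mll file after module L = Lexing:
(* Compatibility shim: same behaviour as Lexing.new_line from OCaml 3.11 *)
let new_line lexbuf =
  let lcp = lexbuf.L.lex_curr_p in
  lexbuf.L.lex_curr_p <- { lcp with
    L.pos_lnum = lcp.L.pos_lnum + 1;
    L.pos_bol = lcp.L.pos_cnum;
  }
The offending rule then calls the local shim instead of the 3.11 library function:
| ['\n' '\r']+ {new_line lexbuf; TEOL}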

You can implement new_line yourself, but I think upgrading OCaml would be better. I know you said you can't upgrade the OS, but a newer version of OCaml can be installed in your home directory without any superuser privileges. OPAM is a package manager for OCaml that makes it very easy to install the latest version of OCaml.

Related

OCaml - How to serialize and deserialize Yaml to records

What are the current community-preferred libraries to parse and work with YAML, and how do you use them to serialize and deserialize a record like this:
type book = {
title: string;
authors: string list
}
This is how I got string -> record and back.
$ opam update
$ opam install yaml ppx_deriving_yaml
Update dune with the preprocess clause:
; `dune` file
(executable
(name main)
(libraries yaml)
(preprocess
(pps ppx_deriving_yaml)))
Short version:
let serialize_book (book_rec : book) : (string, string) result =
  let res = Yaml.to_string (book_to_yaml book_rec) in
  map_error ~f:(fun (`Msg m) -> m) res
let deserialize_book (book_str : string) : (book, string) result =
  let res =
    Yaml.of_string book_str >>= fun yaml_value -> book_of_yaml yaml_value
  in
  map_error ~f:(fun (`Msg m) -> m) res
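Note that map_error ~f:... and >>= in the short version come from a helper library (presumably something like Base), not from the yaml library itself. A minimal sketch of equivalent stand-ins using only the standard library's Result module (OCaml 4.08 or later):
(* Stand-ins for the helpers used above, built on Stdlib.Result *)
let map_error ~f r = Result.map_error f r
let ( >>= ) = Result.bind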
More verbose/descriptive version:
(* Define a record *)
(* `[@@deriving yaml]` generates a bunch of functions, one being `book_to_yaml` to convert the record into a Yaml type, another `book_of_yaml` to convert Yaml type to record *)
type book = {
  title: string;
  authors: string list
} [@@deriving yaml]
let serialize =
  let (v: book) = { title = "Cryptonomicon"; authors = [ "Neal Stephenson" ] } in
  (* `book_to_yaml` converts the record to a `yaml` value *)
  let yaml_structure = book_to_yaml v in
  (* `to_string` converts a `yaml`-typed data structure to a string *)
  match Yaml.to_string yaml_structure with
  | Ok s ->
    print_endline ("Serialize:");
    print_endline (s)
  | Error (`Msg e) -> print_endline e
let deserialize =
  let str = "title: Cryptonomicon\nauthors:\n- Neal Stephenson" in
  (* `of_string` converts from string to a `yaml res` data structure, where `res` is Result *)
  match Yaml.of_string str with
  | Ok yaml_value ->
    (* `book_of_yaml` is generated by `[@@deriving yaml]` *)
    (* `book_of_yaml` converts from the `yaml` type to `book res`, where `res` is Result *)
    (match book_of_yaml yaml_value with
    | Ok t ->
      print_endline ("Deserialize:");
      print_endline ("Title: " ^ t.title);
      print_endline ("Authors: " ^ String.concat ", " t.authors);
    | Error (`Msg e) -> print_endline ("Error - convert to book: " ^ e))
  | Error (`Msg e) -> print_endline ("Error - parsing: " ^ e)

Where is the canonical specification for proto3 that allows JavaScript-like object assignment to an option?

In the Protocol Buffers Version 3 Language Specification
The EBNF syntax for an option is
option = "option" optionName "=" constant ";"
optionName = ( ident | "(" fullIdent ")" ) { "." ident }
constant = fullIdent | ( [ "-" | "+" ] intLit ) | ( [ "-" | "+" ] floatLit ) | strLit | boolLit
ident = letter { letter | decimalDigit | "_" }
fullIdent = ident { "." ident }
strLit = ( "'" { charValue } "'" ) | ( '"' { charValue } '"' )
charValue = hexEscape | octEscape | charEscape | /[^\0\n\\]/
hexEscape = '\' ( "x" | "X" ) hexDigit hexDigit
octEscape = '\' octalDigit octalDigit octalDigit
charEscape = '\' ( "a" | "b" | "f" | "n" | "r" | "t" | "v" | '\' | "'" | '"' )
Or in plain English, an option may be assigned a dotted.notation.identifier, an integer, a float, a boolean, or a single- or double-quoted string, which MUST NOT have "raw" newline characters.
And yet, I'm encountering .proto files in various projects such as grpc-gateway and googleapis, where the rhs of the assignment is not quoted and spans multiple lines. For example in googleapis/google/api/http.proto there is this service definition in a comment block:
// service Messaging {
// rpc UpdateMessage(Message) returns (Message) {
// option (google.api.http) = {
// patch: "/v1/messages/{message_id}"
// body: "*"
// };
// }
// }
In other files, the use of semicolons (and occasionally commas) as separators seems somewhat arbitrary, and I have also seen keys repeated, which in JSON or JavaScript would result in loss of data due to overwriting.
Are there any canonical extensions to the language specification, or are people just Microsofting? (Yes, that's a verb now.)
I posted a similar question on the Protocol Buffers Google Group, and received a private message from a fellow at Google stating the following
This syntax is correct and valid for setting fields on a proto option field which is itself a field referencing a message type. This form is based on the TextFormat spec which I'm unclear if its super well documented, but here's an implementation of it: https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.text_format
When I have time, I will try to unpack what I learn from analyzing TextFormat.
update
I received an answer on the Groups forum
I think for better or worse, "what protoc implements" takes precedence over whatever the spec says. The spec came later and as far as I know we have not put a lot of effort into ensuring that it comprehensively matches the format that protoc expects. I believe the syntax you are looking at is missing from the .proto file format spec but is mentioned here as the "aggregate syntax."
The link above is to a section titled Custom Options in the Language Guide (proto2) page. If you scroll all the way to the end of that section, there is the following snippet that mentions TextFormat:
message FooOptions {
optional int32 opt1 = 1;
optional string opt2 = 2;
}
extend google.protobuf.FieldOptions {
optional FooOptions foo_options = 1234;
}
// usage:
message Bar {
optional int32 a = 1 [(foo_options).opt1 = 123, (foo_options).opt2 = "baz"];
// alternative aggregate syntax (uses TextFormat):
optional int32 b = 2 [(foo_options) = { opt1: 123 opt2: "baz" }];
}

how to build a parser with antlr4 target golang without visitors and walkers

I am trying to write a small parser with the Go target, without using visitors or walkers, but I am not able to find any sample code to build my parser upon.
For example, the following is the grammar code which I am trying to replicate with golang:
# Expr.g4:
grammar Expr;
@header {
}
@parser::members {
def eval(self, left, op, right):
    if ExprParser.MUL == op.type:
        return left * right
    elif ExprParser.DIV == op.type:
        return left / right
    elif ExprParser.ADD == op.type:
        return left + right
    elif ExprParser.SUB == op.type:
        return left - right
    else:
        return 0
}
stat: e NEWLINE {print($e.v);}
| ID '=' e NEWLINE {self.memory[$ID.text] = $e.v}
| NEWLINE
;
e returns [int v]
: a=e op=('*'|'/') b=e {$v = self.eval($a.v, $op, $b.v)}
| a=e op=('+'|'-') b=e {$v = self.eval($a.v, $op, $b.v)}
| INT {$v = $INT.int}
| ID
{
id = $ID.text
$v = self.memory.get(id, 0)
}
| '(' e ')' {$v = $e.v}
;
MUL : '*' ;
DIV : '/' ;
ADD : '+' ;
SUB : '-' ;
ID : [a-zA-Z]+ ; // match identifiers
INT : [0-9]+ ; // match integers
NEWLINE:'\r'? '\n' ; // return newlines to parser (is end-statement signal)
WS : [ \t]+ -> skip ; // toss out whitespace
And this is the python tester code for it:
# test_expr.py:
import sys
from antlr4 import *
from antlr4.InputStream import InputStream
from ExprLexer import ExprLexer
from ExprParser import ExprParser
if __name__ == '__main__':
    parser = ExprParser(None)
    parser.buildParseTrees = False
    parser.memory = {}  # how to add this to generated constructor?
    line = sys.stdin.readline()
    lineno = 1
    while line != '':
        line = line.strip()
        istream = InputStream(line + "\n")
        lexer = ExprLexer(istream)
        lexer.line = lineno
        lexer.column = 0
        token_stream = CommonTokenStream(lexer)
        parser.setInputStream(token_stream)
        parser.stat()
        line = sys.stdin.readline()
        lineno += 1
Can anybody please post sample Go code that is equivalent to the above Python tester and the inlined actions?

ANTLR White Space Question (and not the typical one)

Consider this short SmallC program:
#include "lib"
main() {
int bob;
}
My ANTLR grammar picks it up fine if I specify, in ANTLRWorks and when using the Interpreter, line endings -> "Mac (CR)". If I set the line endings option to Unix (LF), the grammar throws a NoViableAltException and does not recognize anything after the end of the include statement. This error disappears if I add a newline at the end of the include. The computer I'm using for this is a Mac, so I figured it made sense to set the line endings to Mac format. So instead, I switched to a Linux box - and got the same thing. If I type anything in the ANTLRWorks Interpreter box and don't select line endings Mac (CR), I get the same complaints about missing newlines as above, and in addition the last statement of each statement block requires an extra space after the semicolon (i.e. after bob; above).
These bugs show up again when I run a Java version of my grammar on a code input file that I want to parse...
What could possibly be the issue? I'd understand if the problem were the presence of TOO many new lines, in a format that the parser didn't understand or that wasn't caught by my whitespace rule. But in this case, it's an issue of lacking new lines.
My white space declaration is as follows:
WS : ( '\t' | ' ' | '\r' | '\n' )+ { $channel = HIDDEN; } ;
Alternatively, could this be due to an ambiguity issue?
Here is the full grammar file (feel free to ignore the first few blocks, which override ANTLR's default error handling mechanisms):
grammar SmallC;
options {
output = AST ; // Set output mode to AST
}
tokens {
DIV = '/' ;
MINUS = '-' ;
MOD = '%' ;
MULT = '*' ;
PLUS = '+' ;
RETURN = 'return' ;
WHILE = 'while' ;
// The following are empty tokens used in AST generation
ARGS ;
CHAR ;
DECLS ;
ELSE ;
EXPR ;
IF ;
INT ;
INCLUDES ;
MAIN ;
PROCEDURES ;
PROGRAM ;
RETURNTYPE ;
STMTS ;
TYPEIDENT ;
}
@members {
// Force error throwing, and make sure we don't try to recover from invalid input.
// The exceptions are handled in the FrontEnd class, and gracefully end the
// compilation routine after displaying an error message.
protected void mismatch(IntStream input, int ttype, BitSet follow) throws RecognitionException {
throw new MismatchedTokenException(ttype, input);
}
public Object recoverFromMismatchedSet(IntStream input, RecognitionException e, BitSet follow)throws RecognitionException {
throw e;
}
protected Object recoverFromMismatchedToken(IntStream input, int ttype, BitSet follow) throws RecognitionException {
throw new MissingTokenException(ttype, input, null);
}
// We override getErrorMessage() to include information about the specific
// grammar rule in which the error happened, using a stack of nested rules.
Stack paraphrases = new Stack();
public String getErrorMessage(RecognitionException e, String[] tokenNames) {
String msg = super.getErrorMessage(e, tokenNames);
if ( paraphrases.size()>0 ) {
String paraphrase = (String)paraphrases.peek();
msg = msg+" "+paraphrase;
}
return msg;
}
// We override displayRecognitionError() to specify a clearer error message,
// and to include the error type (ie. class of the exception that was thrown)
// for the user's reference. The idea here is to come as close as possible
// to Java's exception output.
public void displayRecognitionError(String[] tokenNames, RecognitionException e)
{
String exType;
String hdr;
if (e instanceof UnwantedTokenException) {
exType = "UnwantedTokenException";
} else if (e instanceof MissingTokenException) {
exType = "MissingTokenException";
} else if (e instanceof MismatchedTokenException) {
exType = "MismatchedTokenException";
} else if (e instanceof MismatchedTreeNodeException) {
exType = "MismatchedTreeNodeException";
} else if (e instanceof NoViableAltException) {
exType = "NoViableAltException";
} else if (e instanceof EarlyExitException) {
exType = "EarlyExitException";
} else if (e instanceof MismatchedSetException) {
exType = "MismatchedSetException";
} else if (e instanceof MismatchedNotSetException) {
exType = "MismatchedNotSetException";
} else if (e instanceof FailedPredicateException) {
exType = "FailedPredicateException";
} else {
exType = "Unknown";
}
if ( getSourceName()!=null ) {
hdr = "Exception of type " + exType + " encountered in " + getSourceName() + " at line " + e.line + ", char " + e.charPositionInLine + ": ";
} else {
hdr = "Exception of type " + exType + " encountered at line " + e.line + ", char " + e.charPositionInLine + ": ";
}
String msg = getErrorMessage(e, tokenNames);
emitErrorMessage(hdr + msg + ".");
}
}
// Force the parser not to try to guess tokens and resume on faulty input,
// but rather display the error, and throw an exception for the program
// to quit gracefully.
@rulecatch {
catch (RecognitionException e) {
reportError(e);
throw e;
}
}
/*------------------------------------------------------------------
* PARSER RULES
*
* Many of these make use of ANTLR's rewrite rules to allow us to
* specify the roots of AST sub-trees, and to allow us to do away
* with certain insignificant literals (like parantheses and commas
* in lists) and to add empty tokens to disambiguate the tree
* construction
*
* The @init and @after definitions populate the paraphrase
* stack to allow us to specify which grammar rule we are in when
* errors are found.
*------------------------------------------------------------------*/
args
@init { paraphrases.push("in these procedure arguments"); }
@after { paraphrases.pop(); }
: ( typeident ( ',' typeident )* )? -> ^( ARGS ( typeident ( typeident )* )? )? ;
body
@init { paraphrases.push("in this procedure body"); }
@after { paraphrases.pop(); }
: '{'! decls stmtlist '}'! ;
decls
@init { paraphrases.push("in these declarations"); }
@after { paraphrases.pop(); }
: ( typeident ';' )* -> ^( DECLS ( typeident )* )? ;
exp
@init { paraphrases.push("in this expression"); }
@after { paraphrases.pop(); }
: lexp ( ( '>' | '<' | '>=' | '<=' | '!=' | '==' )^ lexp )? ;
factor : '(' lexp ')'
| ( MINUS )? ( IDENT | NUMBER )
| CHARACTER
| IDENT '(' ( IDENT ( ',' IDENT )* )? ')' ;
lexp : term ( ( PLUS | MINUS )^ term )* ;
includes
@init { paraphrases.push("in the include statements"); }
@after { paraphrases.pop(); }
: ( '#include' STRING )* -> ^( INCLUDES ( STRING )* )? ;
main
@init { paraphrases.push("in the main method"); }
@after { paraphrases.pop(); }
: 'main' '(' ')' body -> ^( MAIN body ) ;
procedure
@init { paraphrases.push("in this procedure"); }
@after { paraphrases.pop(); }
: ( proc_return_char | proc_return_int )? IDENT^ '('! args ')'! body ;
procedures : ( procedure )* -> ^( PROCEDURES ( procedure)* )? ;
proc_return_char
: 'char' -> ^( RETURNTYPE CHAR ) ;
proc_return_int : 'int' -> ^( RETURNTYPE INT ) ;
// We hard-code the regex (\n)* to fix a bug whereby a program would be accepted
// if it had 0 or more than 1 new lines before EOF but not if it had exactly 1,
// and not if it had 0 new lines between components of the following rule.
program : includes decls procedures main EOF ;
stmt
@init { paraphrases.push("in this statement"); }
@after { paraphrases.pop(); }
: '{'! stmtlist '}'!
| WHILE '(' exp ')' s=stmt -> ^( WHILE ^( EXPR exp ) $s )
| 'if' '(' exp ')' s=stmt ( options {greedy=true;} : 'else' s2=stmt )? -> ^( IF ^( EXPR exp ) $s ^( ELSE $s2 )? )
| IDENT '='^ lexp ';'!
| ( 'read' | 'output' | 'readc' | 'outputc' )^ '('! IDENT ')'! ';'!
| 'print'^ '('! STRING ( options {greedy=true;} : ')'! ';'! )
| RETURN ( lexp )? ';' -> ^( RETURN ( lexp )? )
| IDENT^ '('! ( IDENT ( ','! IDENT )* )? ')'! ';'!;
stmtlist : ( stmt )* -> ^( STMTS ( stmt )* )? ;
term : factor ( ( MULT | DIV | MOD )^ factor )* ;
// We divide typeident into two grammar rules depending on whether the
// ident is of type 'char' or 'int', to allow us to implement different
// rewrite rules in each case.
typeident : typeident_char | typeident_int ;
typeident_char : 'char' s2=IDENT -> ^( CHAR $s2 ) ;
typeident_int : 'int' s2=IDENT -> ^( INT $s2 ) ;
/*------------------------------------------------------------------
* LEXER RULES
*------------------------------------------------------------------*/
// Must come before CHARACTER to avoid ambiguity ('i' matches both IDENT and CHARACTER)
IDENT : ( LCASE_ALPHA | UCASE_ALPHA | '_' ) ( LCASE_ALPHA | UCASE_ALPHA | DIGIT | '_' )* ;
CHARACTER : PRINTABLE_CHAR
| '\n' | '\t' | EOF ;
NUMBER : ( DIGIT )+ ;
STRING : '\"' ( ~( '"' | '\n' | '\r' | 't' ) )* '\"' ;
WS : ( '\t' | ' ' | '\r' | '\n' | '\u000C' )+ { $channel = HIDDEN; } ;
fragment
DIGIT : '0'..'9' ;
fragment
LCASE_ALPHA : 'a'..'z' ;
fragment
NONALPHA_CHAR : '`' | '~' | '!' | '@' | '#' | '$' | '%' | '^' | '&' | '*' | '(' | ')' | '-'
| '_' | '+' | '=' | '{' | '[' | '}' | ']' | '|' | '\\' | ';' | ':' | '\''
| '\\"' | '<' | ',' | '>' | '.' | '?' | '/' ;
fragment
PRINTABLE_CHAR : LCASE_ALPHA | UCASE_ALPHA | DIGIT | NONALPHA_CHAR ;
fragment
UCASE_ALPHA : 'A'..'Z' ;
From the command line, I do get a warning:
java -cp antlr-3.2.jar org.antlr.Tool SmallC.g
warning(200): SmallC.g:182:37: Decision can match input such as "'else'" using multiple alternatives: 1, 2
As a result, alternative(s) 2 were disabled for that input
but that won't stop the lexer/parser from being generated.
Anyway, the problem: ANTLR's lexer tries to match the first lexer rule it encounters in the file, and if it can't match said token, it trickles down to the next lexer rule. Now you have defined the CHARACTER rule before the WS rule, which both match the character \n. That is why it didn't work under Linux since the \n was tokenized as a CHARACTER. If you define the WS rule before the CHARACTER rule, it all works properly:
// other rules ...
WS
: ('\t' | ' ' | '\r' | '\n' | '\u000C')+ { $channel = HIDDEN; }
;
CHARACTER
: PRINTABLE_CHAR | '\n' | '\t' | EOF
;
// other rules ...
Running the test class:
import org.antlr.runtime.*;
import org.antlr.runtime.tree.*;
import org.antlr.stringtemplate.*;
public class Main {
public static void main(String[] args) throws Exception {
String source =
"#include \"lib\"\n" +
"main() {\n" +
" int bob;\n" +
"}\n";
ANTLRStringStream in = new ANTLRStringStream(source);
SmallCLexer lexer = new SmallCLexer(in);
CommonTokenStream tokens = new CommonTokenStream(lexer);
SmallCParser parser = new SmallCParser(tokens);
SmallCParser.program_return returnValue = parser.program();
CommonTree tree = (CommonTree)returnValue.getTree();
DOTTreeGenerator gen = new DOTTreeGenerator();
StringTemplate st = gen.toDOT(tree);
System.out.println(st);
}
}
produces the following AST (shown as an image in the original answer), without any error messages.
But you should fix the grammar warning, and remove \n from the CHARACTER rule since it can never be matched in the CHARACTER rule.
One other thing: you've mixed quite a few keywords into your parser rules without defining them explicitly in your lexer rules. That is tricky because of the first-come-first-serve lexer rules: you don't want 'if' to accidentally be tokenized as an IDENT. Better to do it like this:
IF : 'if';
IDENT : 'a'..'z' ... ; // After the `IF` rule!

Code Golf: Email Address Validation without Regular Expressions

(Edit: What is Code Golf? Code Golf challenges ask you to solve a specific problem with the shortest code, measured by character count, in whichever language you prefer. More info on Meta Stack Overflow.)
Code Golfers, here's a challenge on string operations.
Email Address Validation, but without regular expressions (or similar parsing library) of course. It's not so much about the email addresses but how short you can write the different string operations and constraints given below.
The rules are the following (yes, I know, this is not RFC compliant, but these are going to be the 5 rules for this challenge):
At least 1 character out of this group before the @:
A-Z, a-z, 0-9, . (period), _ (underscore)
@ has to exist, exactly one time
john@smith.com
^
Period (.) has to exist exactly one time after the @
john@smith.com
^
At least 1 only [A-Z, a-z] character between @ and the following . (period)
john@s.com
^
At least 2 only [A-Z, a-z] characters after the final . period
john@smith.ab
^^
Please post the method/function only, which would take a string (proposed email address) and then return a Boolean result (true/false) depending on the email address being valid (true) or invalid (false).
Samples:
b@w.org (valid/true) @w.org (invalid/false)
b@c@d.org (invalid/false) test@org (invalid/false)
test@%.org (invalid/false) s%p@m.org (invalid/false)
j_r@x.c.il (invalid/false) j_r@x.mil (valid/true)
r..t@x.tw (valid/true) foo@a%.com (invalid/false)
Good luck!
C89 (166 characters)
#define B(c)isalnum(c)|c==46|c==95
#define C(x)if(!v|*i++-x)return!1;
#define D(x)for(v=0;x(*i);++i)++v;
v;e(char*i){D(B)C(64)D(isalpha)C(46)D(isalpha)return!*i&v>1;}
Not re-entrant, but can be run multiple times. Test bed:
#include<stdio.h>
#include<assert.h>
main(){
assert(e("b@w.org"));
assert(e("r..t@x.tw"));
assert(e("j_r@x.mil"));
assert(!e("b@c@d.org"));
assert(!e("test@%.org"));
assert(!e("j_r@x.c.il"));
assert(!e("@w.org"));
assert(!e("test@org"));
assert(!e("s%p@m.org"));
assert(!e("foo@a%.com"));
puts("success!");
}
J
:[[/%^(:[[+-/^,&i|:[$[' ']^j+0__:k<3:]]
C89, 175 characters.
#define G &&*((a+=t+1)-1)==
#define H (t=strspn(a,A
t;e(char*a){char A[66]="_.0123456789Aa";short*s=A+12;for(;++s<A+64;)*s=s[-1]+257;return H))G 64&&H+12))G 46&&H+12))>1 G 0;}
I am using the standard library function strspn(), so I feel this answer isn't as "clean" as strager's answer which does without any library functions. (I also stole his idea of declaring a global variable without a type!)
One of the tricks here is that by putting . and _ at the start of the string A, it's possible to include or exclude them easily in a strspn() test: when you want to allow them, use strspn(something, A); when you don't, use strspn(something, A+12). Another is assuming that sizeof (short) == 2 * sizeof (char), and building up the array of valid characters 2 at a time from the "seed" pair Aa. The rest was just looking for a way to force subexpressions to look similar enough that they could be pulled out into #defined macros.
To make this code more "portable" (heh :-P) you can change the array-building code from
char A[66]="_.0123456789Aa";short*s=A+12;for(;++s<A+64;)*s=s[-1]+257;
to
char*A="_.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
for a cost of 5 additional characters.
Python (181 characters including newlines)
def v(E):
    import string as t;a=t.ascii_letters;e=a+"1234567890_.";t=e,e,"@",e,".",a,a,a,a,a,"",a
    for c in E:
        if c in t[0]:t=t[2:]
        elif not c in t[1]:return 0>1
    return""==t[0]
Basically just a state machine using obfuscatingly short variable names.
C (166 characters)
#define F(t,u)for(r=s;t=(*s-64?*s-46?isalpha(*s)?3:isdigit(*s)|*s==95?4:0:2:1);++s);if(s-r-1 u)return 0;
V(char*s){char*r;F(2<,<0)F(1=)F(3=,<0)F(2=)F(3=,<1)return 1;}
The single newline is required, and I've counted it as one character.
Python, 149 chars (after putting the whole for loop into one semicolon-separated line, which I haven't done here for "readability" purposes):
def v(s,t=0,o=1):
    for c in s:
        k=c=="@"
        p=c=="."
        A=c.isalnum()|p|(c=="_")
        L=c.isalpha()
        o&=[A,k|A,L,L|p,L,L,L][t]
        t+=[1,k,1,p,1,1,0][t]
    return(t>5)&o
Test cases, borrowed from strager's answer:
assert v("b@w.org")
assert v("r..t@x.tw")
assert v("j_r@x.mil")
assert not v("b@c@d.org")
assert not v("test@%.org")
assert not v("j_r@x.c.il")
assert not v("@w.org")
assert not v("test@org")
assert not v("s%p@m.org")
assert not v("foo@a%.com")
print "Yeah!"
Explanation: When iterating over the string, two variables keep getting updated.
t keeps the current state:
t = 0: We're at the beginning.
t = 1: We were at the beginning and have found at least one legal character (letter, number, underscore, period)
t = 2: We have found the "@"
t = 3: We have found at least one legal character (i.e. letter) after the "@"
t = 4: We have found the period in the domain name
t = 5: We have found one legal character (letter) after the period
t = 6: We have found at least two legal characters after the period
o as in "okay" starts as 1, i.e. true, and is set to 0 as soon as a character is found that is illegal in the current state.
Legal characters are:
In state 0: letter, number, underscore, period (change state to 1 in any case)
In state 1: letter, number, underscore, period, at-sign (change state to 2 if "@" is found)
In state 2: letter (change state to 3)
In state 3: letter, period (change state to 4 if period found)
In states 4 thru 6: letter (increment state when in 4 or 5)
When we have gone all the way through the string, we return whether t==6 (t>5 is one char less) and o is 1.
Whatever version of C++ MSVC2008 supports.
Here's my humble submission. Now I know why they told me never to do the things I did in here:
#define N return 0
#define I(x) &&*x!='.'&&*x!='_'
bool p(char*a) {
if(!isalnum(a[0])I(a))N;
char*p=a,*b=0,*c=0;
for(int d=0,e=0;*p;p++){
if(*p=='@'){d++;b=p;}
else if(*p=='.'){if(d){e++;c=p;}}
else if(!isalnum(*p)I(p))N;
if (d>1||e>1)N;
}
if(b>c||b+1>=c||c+2>=p)N;
return 1;
}
Not the greatest solution no doubt, and pretty darn verbose, but it is valid.
Fixed (All test cases pass now)
static bool ValidateEmail(string email)
{
var numbers = "1234567890";
var uppercase = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
var lowercase = uppercase.ToLower();
var arUppercase = uppercase.ToCharArray();
var arLowercase = lowercase.ToCharArray();
var arNumbers = numbers.ToCharArray();
var atPieces = email.Split(new string[] { "@"}, StringSplitOptions.RemoveEmptyEntries);
if (atPieces.Length != 2)
return false;
foreach (var c in atPieces[0])
{
if (!(arNumbers.Contains(c) || arLowercase.Contains(c) || arUppercase.Contains(c) || c == '.' || c == '_'))
return false;
}
if(!atPieces[1].Contains("."))
return false;
var dotPieces = atPieces[1].Split('.');
if (dotPieces.Length != 2)
return false;
foreach (var c in dotPieces[0])
{
if (!(arLowercase.Contains(c) || arUppercase.Contains(c)))
return false;
}
var found = 0;
foreach (var c in dotPieces[1])
{
if ((arLowercase.Contains(c) || arUppercase.Contains(c)))
found++;
else
return false;
}
return found >= 2;
}
C89 character set agnostic (262 characters)
#include <stdio.h>
/* the 'const ' qualifiers should be removed when */
/* counting characters: I don't like warnings :) */
/* also the 'int ' should not be counted. */
/* it needs only 2 spaces (after the returns), should be only 2 lines */
/* that's a total of 262 characters (1 newline, 2 spaces) */
/* code golf starts here */
#include<string.h>
int v(const char*e){
const char*s="0123456789._abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
if(e=strpbrk(e,s))
if(e=strchr(e+1,'@'))
if(!strchr(e+1,'@'))
if(e=strpbrk(e+1,s+12))
if(e=strchr(e+1,'.'))
if(!strchr(e+1,'.'))
if(strlen(e+1)>1)
return 1;
return 0;
}
/* code golf ends here */
int main(void) {
const char *t;
t = "b@w.org"; printf("%s ==> %d\n", t, v(t));
t = "r..t@x.tw"; printf("%s ==> %d\n", t, v(t));
t = "j_r@x.mil"; printf("%s ==> %d\n", t, v(t));
t = "b@c@d.org"; printf("%s ==> %d\n", t, v(t));
t = "test@%.org"; printf("%s ==> %d\n", t, v(t));
t = "j_r@x.c.il"; printf("%s ==> %d\n", t, v(t));
t = "@w.org"; printf("%s ==> %d\n", t, v(t));
t = "test@org"; printf("%s ==> %d\n", t, v(t));
t = "s%p@m.org"; printf("%s ==> %d\n", t, v(t));
t = "foo@a%.com"; printf("%s ==> %d\n", t, v(t));
return 0;
}
Version 2
Still C89 character set agnostic, bugs hopefully corrected (303 chars; 284 without the #include)
#include<string.h>
#define Y strchr
#define X{while(Y
v(char*e){char*s="0123456789_.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
if(*e!='@')X(s,*e))e++;if(*e++=='@'&&!Y(e,'@')&&Y(e+1,'.'))X(s+12,*e))e++;if(*e++=='.'
&&!Y(e,'.')&&strlen(e)>1){while(*e&&Y(s+12,*e++));if(!*e)return 1;}}}return 0;}
That #define X is absolutely disgusting!
Test as for my first (buggy) version.
VBA/VB6 - 484 chars
Explicit off
usage: VE("b@w.org")
Function V(S, C)
V = True
For I = 1 To Len(S)
If InStr(C, Mid(S, I, 1)) = 0 Then
V = False: Exit For
End If
Next
End Function
Function VE(E)
VE = False
C1 = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
C2 = "0123456789._"
P = Split(E, "@")
If UBound(P) <> 1 Then GoTo X
If Len(P(0)) < 1 Or Not V(P(0), C1 & C2) Then GoTo X
E = P(1): P = Split(E, ".")
If UBound(P) <> 1 Then GoTo X
If Len(P(0)) < 1 Or Not V(P(0), C1) Or Len(P(1)) < 2 Or Not V(P(1), C1) Then GoTo X
VE = True
X:
End Function
Java: 257 chars (not including the 3 end of lines for readability ;-)).
boolean q(char[]s){int a=0,b=0,c=0,d=0,e=0,f=0,g,y=-99;for(int i:s)
d=(g="@._0123456789QWERTYUIOPASDFGHJKLZXCVBNMqwertyuiopasdfghjklzxcvbnm".indexOf(i))<0?
y:g<1&&++e>0&(b<1|++a>1)?y:g==1&e>0&(c<1||f++>0)?y:++b>0&g>12?f>0?d+1:f<1&e>0&&++c>0?
d:d:d;return d>1;}
Passes all the tests (my older version was incorrect).
Erlang 266 chars:
-module(cg_email).
-export([test/0]).
%%% golf code begin %%%
-define(E,when X>=$a,X=<$z;X>=$A,X=<$Z).
-define(I(Y,Z),Y([X|L])?E->Z(L);Y(_)->false).
-define(L(Y,Z),Y([X|L])?E;X>=$0,X=<$9;X=:=$.;X=:=$_->Z(L);Y(_)->false).
?L(e,m).
m([$@|L])->a(L);?L(m,m).
?I(a,i).
i([$.|L])->l(L);?I(i,i).
?I(l,c).
?I(c,g).
g([])->true;?I(g,g).
%%% golf code end %%%
test() ->
true = e("b@w.org"),
false = e("b@c@d.org"),
false = e("test@%.org"),
false = e("j_r@x.c.il"),
true = e("r..t@x.tw"),
false = e("test@org"),
false = e("s%p@m.org"),
true = e("j_r@x.mil"),
false = e("foo@a%.com"),
ok.
Ruby, 225 chars.
This is my first Ruby program, so it's probably not very Ruby-like :-)
def v z;r=!a=b=c=d=e=f=0;z.chars{|x|case x when'@';r||=b<1||!e;e=!1 when'.'
e ?b+=1:(a+=1;f=e);r||=a>1||(c<1&&!e)when'0'..'9';b+=1;r|=!e when'A'..'Z','a'..'z'
e ?b+=1:f ?c+=1:d+=1;else r=1 if x!='_'||!e|!b+=1;end};!r&&d>1 end
'Using no regex':
PHP 47 Chars.
<?=filter_var($argv[1],FILTER_VALIDATE_EMAIL);
Haskell (GHC 6.8.2), 144 characters (down from 165, then 161)
Using pattern matching, elem, span and all:
a=['A'..'Z']++['a'..'z']
e=f.span(`elem`"._0123456789"++a)
f(_:_,'@':d)=g$span(`elem`a)d
f _=False
g(_:_,'.':t@(_:_:_))=all(`elem`a)t
g _=False
The above was tested with the following code:
main :: IO ()
main = print $ and [
e "b@w.org",
e "r..t@x.tw",
e "j_r@x.mil",
not $ e "b@c@d.org",
not $ e "test@%.org",
not $ e "j_r@x.c.il",
not $ e "@w.org",
not $ e "test@org",
not $ e "s%p@m.org",
not $ e "foo@a%.com"
]
