How do I split an atom in Parslet? - ruby

I'm building an SQL-like query language. I would like to be able to handle lists of items delimited by commas. I have successfully achieved this with this code:
# Grammar for a small SQL-like query of the form
# "SELECT <values> FROM <name>".
class QueryParser < Parslet::Parser
  # One or more whitespace characters, and its optional variant.
  rule(:space)  { match('\s').repeat(1) }
  rule(:space?) { space.maybe }

  # A comma, with optional whitespace on either side.
  rule(:delimiter) { space? >> str(',') >> space? }

  # SELECT keyword followed by its (optional) comma-separated values.
  # The whole value list is captured under :select.
  rule(:select)       { str('SELECT') >> space? }
  rule(:select_value) { str('*') | match('[a-zA-Z]').repeat(1) }
  rule(:select_arguments) do
    value_list = select_value >> (delimiter >> select_value).repeat
    space? >> value_list.maybe.as(:select) >> space?
  end

  # FROM keyword followed by an alphabetic name captured under :from.
  rule(:from)           { str('FROM') >> space? }
  rule(:from_arguments) { match('[a-zA-Z]').repeat(1).as(:from) >> space? }

  rule(:query) { select >> select_arguments >> from >> from_arguments }
  root(:query)
end
Where something like SELECT id,name,fork FROM forks correctly outputs the {:select=>"id,name,fork"#7, :from=>"forks"#25} tree.
Now, instead of messing around with this later, I would like to be able to convert the SELECT arguments (id,name,fork in this case) into an Array. I can do this by running 'id,name,fork'.split ','. I cannot get the Parslet transformer to do this for me when applied. This is my code for my query transformer:
# Transform from the question: intended to split the :select string on
# commas. The question reports that applying it leaves the tree unchanged.
class QueryTransformer < Parslet::Transform
  rule(:select => simple(:args)) do
    args.split(',')
  end
end
When applied like so:
QueryTransformer.new.apply(
QueryParser.new.parse('SELECT id,name,fork FROM forks')
)
The result is the same as when I didn't apply it: {:select=>"id,name,fork"#7, :from=>"forks"#25}.
The value I was hoping :select to be is an Array like this ["id","name","fork"].
My question is: how do I split the value of :select into an Array using transformers?

You need to put "as(:xxx)" on whatever part of the parse tree you want to be able to play with later.
Here I changed your rule(:select_value) to remember the values as a :value
rule(:select_value) { (str('*') | match('[a-zA-Z]').repeat(1)).as(:value) }
Now your parser outputs :
{:select=>[{:value=>"id"#7}, {:value=>"name"#10}, {:value=>"fork"#15}], :from=>"forks"#25}
Which is easy to transform using:
class QueryTransformer < Parslet::Transform
rule(:value => simple(:val)) { val }
end
Then you get:
{:select=>["id"#7, "name"#10, "fork"#15], :from=>"forks"#25}
So in full the code is as follows :-
require 'parslet'
# Grammar for a small SQL-like query of the form
# "SELECT <values> FROM <name>". Each selected value is captured
# individually under :value so a transform can work on it.
class QueryParser < Parslet::Parser
  # One or more whitespace characters, and its optional variant.
  rule(:space)  { match('\s').repeat(1) }
  rule(:space?) { space.maybe }

  # A comma, with optional whitespace on either side.
  rule(:delimiter) { space? >> str(',') >> space? }

  # SELECT keyword followed by its (optional) comma-separated values.
  rule(:select) { str('SELECT') >> space? }
  rule(:select_value) do
    (str('*') | match('[a-zA-Z]').repeat(1)).as(:value)
  end
  rule(:select_arguments) do
    value_list = select_value >> (delimiter >> select_value).repeat
    space? >> value_list.maybe.as(:select) >> space?
  end

  # FROM keyword followed by an alphabetic name captured under :from.
  rule(:from)           { str('FROM') >> space? }
  rule(:from_arguments) { match('[a-zA-Z]').repeat(1).as(:from) >> space? }

  rule(:query) { select >> select_arguments >> from >> from_arguments }
  root(:query)
end
puts QueryParser.new.parse('SELECT id,name,fork FROM forks')
# => {:select=>[{:value=>"id"#7}, {:value=>"name"#10}, {:value=>"fork"#15}], :from=>"forks"#25}
# Replaces every {:value => x} node with x itself.
class QueryTransformer < Parslet::Transform
  rule(value: simple(:val)) do
    val
  end
end
puts QueryTransformer.new.apply(
QueryParser.new.parse('SELECT id,name,fork FROM forks')
)
# => {:select=>["id"#7, "name"#10, "fork"#15], :from=>"forks"#25}

Related

How do I use Parslet with strings not Parslet Slices

I've started using Parslet to parse some custom data. In the examples, the resulting parsed data is something like:
{ :custom_string => "data"#6 }
And I've created the Transform something like
rule(:custom_string => simple(:x)) { x.to_s }
But it doesn't match, presumably because I'm passing "data"#6 instead of just "data" which isn't just a simple string. All the examples for the Transform have hashes with strings, not with Parslet::Slices which is what the parser outputs. Maybe I'm missing a step but I can't see anything in the docs.
EDIT : More sample code (reduced version but should still be explanatory)
original_text = 'MSGSTART/DATA1/DATA2/0503/MAR'
require "parslet"
include Parslet
module ParseExample
  # Parses one slash-separated line of the form
  # "MSGSTART/<data1>/<data2>/<serial_number>/<month>".
  class Parser < Parslet::Parser
    rule(:fs)       { str("/") }
    rule(:newline)  { str("\n") | str("\r\n") }
    rule(:msgstart) { str("MSGSTART") }

    # Each field is one or more word characters, captured under its own key.
    rule(:data1)         { match("\\w").repeat(1).as(:data1) }
    rule(:data2)         { match("\\w").repeat(1).as(:data2) }
    rule(:serial_number) { match("\\w").repeat(1).as(:serial_number) }
    rule(:month)         { match("\\w").repeat(1).as(:month) }

    # The line terminator is optional: the sample input has no trailing
    # newline, and a mandatory one would make the parse fail on it.
    rule(:first_line) do
      msgstart >> fs >>
        data1 >> fs >>
        data2 >> fs >>
        serial_number >> fs >>
        month >> newline.maybe
    end

    rule(:document) { first_line >> newline.maybe }
    root(:document)
  end
end
module ParseExample
  # Transform from the question: one rule per field key, each converting
  # the matched value with to_s. The question reports these do not fire on
  # the parser's multi-key hash.
  class Transformer < Parslet::Transform
    %i[data1 data2 serial_number month].each do |field|
      rule(field => simple(:x)) { x.to_s }
    end
  end
end
# Run by calling...
p = ParseExample::Parser.new
# Keep the parse tree in one variable and pass that same variable to the
# transformer (the names previously differed, raising NameError).
parser_result = p.parse(original_text)
# => {:data1=>"data1"#6, :data2=>"data2"#12, :serial_number=>"0503"#18, :month=>"MAR"#23}
t = ParseExample::Transformer.new
transformed = t.apply(parser_result)
# Actual result => {:data1=>"data1"#6, :data2=>"data2"#12, :serial_number=>"0503"#18, :month=>"MAR"#23}
# Expected result => {:data1=>"data1", :data2=>"data2", :serial_number=>"0503", :month=>"MAR"}
You can't replace individual key/value pairs. You have to replace the whole hash at once.
I fell for this the first time I wrote transformers too. The key is that transform rules match a whole node and replace it... in its entirety. Once a node has been matched, it's not visited again.
If you did consume a hash and only match a single key/value pair, replacing it with a value... you just lost all the other key/value pairs in the same hash.
However... There is a way!
If you do want to pre-process all the nodes in a hash before matching the whole hash, then the hash's values need to be hashes themselves. Then you could match those and convert them to strings. You can usually do this by simply adding another 'as' in your parser.
For example:
original_text = 'MSGSTART/DATA1/DATA2/0503/MAR'
require "parslet"
include Parslet
module ParseExample
  # Parses one slash-separated line of the form
  # "MSGSTART/<data1>/<data2>/<serial_number>/<month>", wrapping every
  # field value in its own {:string => ...} hash.
  class Parser < Parslet::Parser
    rule(:fs)       { str("/") }
    rule(:newline)  { str("\n") | str("\r\n") }
    rule(:msgstart) { str("MSGSTART") }

    # Notice the as! Every field reuses this rule, so each value becomes a
    # nested hash that a transform rule can match on its own.
    rule(:string) { match("\\w").repeat(1).as(:string) }

    rule(:data1)         { string.as(:data1) }
    rule(:data2)         { string.as(:data2) }
    rule(:serial_number) { string.as(:serial_number) }
    rule(:month)         { string.as(:month) }

    rule(:first_line) do
      msgstart >> fs >> data1 >> fs >> data2 >> fs >> serial_number >> fs >> month >> newline.maybe
    end

    rule(:document) { first_line >> newline.maybe }
    root(:document)
  end
end
# Run by calling...
p = ParseExample::Parser.new
parser_result = p.parse(original_text)
puts parser_result.inspect
# => {:data1=>{:string=>"DATA1"#9},
#     :data2=>{:string=>"DATA2"#15},
#     :serial_number=>{:string=>"0503"#21},
#     :month=>{:string=>"MAR"#26}}
# See how the values in the hash are now all hashes themselves.
module ParseExample
  # Replaces every {:string => x} node with x converted via to_s.
  class Transformer < Parslet::Transform
    rule(string: simple(:x)) do
      x.to_s
    end
  end
end
# We just need to match the "{:string => x}" hashes now...and replace them with strings
t = ParseExample::Transformer.new
transformed = t.apply(parser_result)
puts transformed.inspect
# => {:data1=>"DATA1", :data2=>"DATA2", :serial_number=>"0503", :month=>"MAR"}
# Tada!!!
If you had wanted to handle the whole line, to make an object from it, say:
# Simple record holding the four fields of one parsed message line.
class Entry
  # Stores each keyword argument in the instance variable of the same name.
  #
  # @param data1 value for @data1
  # @param data2 value for @data2
  # @param serial_number value for @serial_number
  # @param month value for @month
  def initialize(data1:, data2:, serial_number:, month:)
    @data1 = data1
    @data2 = data2
    @serial_number = serial_number
    @month = month
  end
end
module ParseExample
  # Two-stage transform: first unwrap every {:string => x} node to a plain
  # string, then turn the complete four-key hash into an Entry.
  class Transformer < Parslet::Transform
    rule(string: simple(:x)) { x.to_s }

    # Match the whole hash at once; all four keys must be present.
    rule(
      data1: simple(:d1),
      data2: simple(:d2),
      serial_number: simple(:s),
      month: simple(:m)
    ) do
      Entry.new(data1: d1, data2: d2, serial_number: s, month: m)
    end
  end
end
t = ParseExample::Transformer.new
transformed = t.apply(parser_result)
puts transformed.inspect
# => #<Entry:0x007fd5a3d26bf0 @data1="DATA1", @data2="DATA2", @serial_number="0503", @month="MAR">

Why this (dead simple) ruby regex behaves like this?

Why "whatever".gsub(/.*/, "bien") outputs "bienbien" instead of just "bien"?
I'm completely lost here :S Anyone could point me in the right direction?
You can see what's happening using a block:
>> 'foo'.sub(/.*/) { |m| p m; 'bar' }
"foo"
=> "bar"
>> 'foo'.gsub(/.*/) { |m| p m; 'bar' }
"foo"
""
=> "barbar"
>> 'foo'.gsub(/^.*/) { |m| p m; 'bar' }
"foo"
=> "bar"
>> 'foo'.gsub(/^.*$/) { |m| p m; 'bar' }
"foo"
=> "bar"
>> 'foo'.gsub(/.*$/) { |m| p m; 'bar' }
"foo"
""
=> "barbar"
>> 'foo'.gsub(/.+/) { |m| p m; 'bar' }
"foo"
=> "bar"
Put another way, gsub will continue matching, and matches an empty string at the very end of a line. (And that is arguably a bug.)

Convert a string with brackets to tree, Ruby

I have a string "Animals ( Reptiles Birds ( Eagles Pigeons Crows ) )" and I need to return:
a = [
{
"Animals" => [
{
"Reptiles" => nil
},
{
"Birds" => [
{ "Eagles" => nil },
{ "Pigeons" => nil },
{ "Crows" => nil }
]
}
]
}
]
I don't understand how I can do it.
Where I can find some example or what I can search in google?
Here is a one way you could convert the string to an array.
Code
# Recursively converts a flat token array (words and "(" / ")" markers)
# into an array of single-key hashes: { word => nil } for a leaf and
# { word => [children...] } for a word followed by a parenthesised group.
#
# Relies on split_arr to group the top-level tokens, then descends into
# every group that came back as an Array.
def arrayify(arr)
  split_arr(arr).map do |node|
    name, children = node.first
    { name => (children.is_a?(Array) ? arrayify(children) : children) }
  end
end
# Groups a flat token array one level deep.
#
# Each word becomes a single-key hash. A word that is immediately followed
# by "(" is paired with the raw tokens up to the matching ")" (nested
# parentheses are kept inside that token list); any other word is paired
# with nil.
#
# @param arr [Array<String>] tokens, e.g. "A ( B C ) D".split
# @return [Array<Hash>] e.g. [{"A"=>["B", "C"]}, {"D"=>nil}]
# @raise [ArgumentError] if an opening parenthesis is never closed
def split_arr(arr)
  tokens = arr.dup # work on a copy so the caller's array is left intact
  nodes = []
  until tokens.empty?
    word = tokens.shift
    unless tokens.first == '('
      nodes << { word => nil }
      next
    end

    tokens.shift # discard the opening parenthesis
    depth = 0
    children = []
    loop do
      # Running out of tokens here means the group was never closed;
      # without this check the loop would append nil forever.
      raise ArgumentError, "unbalanced parentheses after #{word.inspect}" if tokens.empty?

      token = tokens.shift
      case token
      when ')'
        break if depth.zero?

        depth -= 1
      when '('
        depth += 1
      end
      children << token
    end
    nodes << { word => children }
  end
  nodes
end
Example
str = "Animals ( Reptiles Birds ( Eagles Pigeons Crows ) ) Foods ( " +
"Snacks Breakfast ( Pancakes Waffles ) )"
arrayify(str.split)
#=> [{"Animals"=>[{"Reptiles" =>nil},
# {"Birds" =>[{"Eagles" =>nil},
# {"Pigeons"=>nil},
# {"Crows" =>nil}
# ]
# }
# ]
# },
# {"Foods" =>[{"Snacks" =>nil},
# {"Breakfast"=>[{"Pancakes"=>nil},
# {"Waffles" =>nil}
# ]
# }
# ]
# }
# ]
I don't understand how I can do it. Where I can find some example or what I can search in google?
Using a recursive regex is one option, especially if the parenthesis are properly balanced:
http://www.regular-expressions.info/recurse.html
If it's over your head, recursively plough through the string using a normal regex. Match something like:
[a-z]+ ?\([^()]*\)
... then replace the match with a place holder in the original string. Rinse, repeat.
Using a parser is an alternative option. You could write a simplistic one, or use a tool e.g.:
http://thingsaaronmade.com/blog/a-quick-intro-to-writing-a-parser-using-treetop.html
This works with your example, but I don't know how general it is.
Code
# Converts a string such as "Animals ( Reptiles Birds ( Eagles ) )" into a
# nested array of hashes by rewriting it into Ruby literal syntax and
# evaluating the result.
#
# NOTE: leaf values come back as the string "nil", not nil, because the
# final substitution quotes every word, including the inserted nil.
#
# SECURITY: this method uses eval. The guard below restricts the input to
# word characters, whitespace and parentheses so that, after every word has
# been quoted, nothing executable is left. Prefer a real parser for
# untrusted input.
#
# @param str [String] words and parentheses separated by whitespace
# @return [Array<Hash>] nested structure built from the string
# @raise [ArgumentError] if str contains any other character
def arrayify(str)
  unless str.match?(/\A[\w\s()]*\z/)
    raise ArgumentError, 'input may contain only word characters, whitespace and parentheses'
  end

  ruby_literal = str.gsub(/(\w+)\s+\(/, '{\1=>[')          # "Word (" opens a hash with an array value
                    .gsub(/(?!\{)(\w+)\s+/, '{\1=>nil},')  # remaining words become leaf hashes
                    .gsub(')', ']}')                       # ")" closes the array and its hash
                    .gsub(/\b(\w+)\b/, "\"\\1\"")          # quote every word
  eval("[#{ruby_literal}]")
end
Example
str = "Animals ( Reptiles Birds ( Eagles Pigeons Crows ) )"
arrayify(str)
#=> [{ "Animals"=>[{ "Reptiles"=>"nil"},
# { "Birds" =>[{ "Eagles" =>"nil" },
# { "Pigeons"=>"nil" },
# { "Crows" =>"nil" }
# ]
# }
# ]
# }
# ]
Explanation
s1 = str.gsub(/(\w+)\s+\(/,'{\1=>[')
#=> "{Animals=>[ Reptiles {Birds=>[ Eagles Pigeons Crows ) )"
s2 = s1.gsub( /(?!\{)(\w+)\s+/, '{\1=>nil},')
#=> "{Animals=>[ {Reptiles=>nil},{Birds=>[ {Eagles=>nil},{Pigeons=>nil},{Crows=>nil},) )"
s3 = s2.gsub(')', ']}')
#=> "{Animals=>[ {Reptiles=>nil},{Birds=>[ {Eagles=>nil},{Pigeons=>nil},{Crows=>nil},]} ]}"
s4 = s3.gsub(/\b(\w+)\b/,"\"\\1\"")
#=> "{\"Animals\"=>[ {\"Reptiles\"=>\"nil\"},{\"Birds\"=>[ {\"Eagles\"=>\"nil\"},{\"Pigeons\"=>\"nil\"},{\"Crows\"=>\"nil\"},]} ]}"
eval('['+s4+']')
#=> <result in example>
Pardon me, but I have to run. The eval police are coming.

How to add a postfix in incremental order to a printed string in Ruby?

For microarray data processing, I need to make a list of gene names from 1 to 654, like Gene_1 ... Gene_654.
My simple Ruby code produces the following:
1.upto(654).each { |i| print "Gene" }
The result is:
GeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGene
GeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGene
GeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGene
GeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGene
GeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGene
GeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGene
GeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGene
GeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGene
GeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGene
..................................
GeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGeneGene=> 1
irb(main):008:0>
How do I add a "postfix _#" in sequential incremental order to a printed string and put them in a column, like:
Gene_1
Gene_2
::::::
Gene_654
1.upto(654).each { |i| printf "%8s\t", "Gene_#{i}" }
Source: http://www.ruby-doc.org/core-2.0.0/Kernel.html#format-method
Edited to conform to the new requirements:
1.upto(654).each { |i| puts "Gene_#{i}" }
--output:--
Gene_1
Gene_2
...
Gene_654
I'd use:
str = 'Gene_0'
654.times { puts str.next! }
Which outputs:
Gene_1
...
Gene_654
If you need the text output to the same width, perhaps because you're going to append information to each line, use some formatting:
str = 'Gene_0'
654.times { puts '%8s ' % str.next! }
# >>   Gene_1
...
# >>   Gene_9
# >>  Gene_10
...
# >>  Gene_99
# >> Gene_100
...
# >> Gene_654
If you need columns across a page:
str = 'Gene_0'
654.times { print '%8s ' % str.next! }
puts
Which spaces them out in 8-space-wide columns.
By default %8s uses right alignment, which isn't always what you want. Instead you can use %-8s for left-alignment.
You can build an array containing the column headings:
str = 'Gene_0'
columns = []
654.times { columns << '%-8s' % str.next! }
puts columns.join(' ')
You could even use something like inject:
str = 'Gene_0'
columns = []
(1..654).inject(columns) { |a, i| a.push('%-8s' % str.next!) }
puts columns.join(' ')
But that starts to add code that doesn't really help.
The OP asked:
...how to add " " to the result...
The output above doesn't make it easy to see the whitespace automatically appended to the output by '%8s ', so I tweaked the format-string to make it more obvious by wrapping the output in double-quotes:
str = 'Gene_0'
654.times { puts '"%8s "' % str.next! }
And here's the corresponding output, trimmed down to show how the format string maintains the column width as the string value increments:
# >> "  Gene_1 "
...
# >> "  Gene_9 "
# >> " Gene_10 "
...
# >> " Gene_99 "
# >> "Gene_100 "
...
# >> "Gene_654 "
If you want all the white-space to occur at the end of the column, use a left-alignment:
str = 'Gene_0'
654.times { puts '"%-8s "' % str.next! }
Which outputs:
# >> "Gene_1   "
...
# >> "Gene_9   "
# >> "Gene_10  "
...
# >> "Gene_99  "
# >> "Gene_100 "
...
# >> "Gene_654 "

Indentation sensitive parser using Parslet in Ruby?

I am attempting to parse a simple indentation sensitive syntax using the Parslet library within Ruby.
The following is an example of the syntax I am attempting to parse:
level0child0
level0child1
level1child0
level1child1
level2child0
level1child2
The resulting tree would look like so:
[
{
:identifier => "level0child0",
:children => []
},
{
:identifier => "level0child1",
:children => [
{
:identifier => "level1child0",
:children => []
},
{
:identifier => "level1child1",
:children => [
{
:identifier => "level2child0",
:children => []
}
]
},
{
:identifier => "level1child2",
:children => []
},
]
}
]
The parser that I have now can parse nesting level 0 and 1 nodes, but cannot parse past that:
require 'parslet'
# Parser from the question: each node is an identifier line followed by any
# number of singly-indented identifier lines, captured as :children.
class IndentationSensitiveParser < Parslet::Parser
  rule(:indent)     { str(' ') }
  rule(:newline)    { str("\n") }
  rule(:identifier) { match['A-Za-z0-9'].repeat.as(:identifier) }

  rule(:node) do
    child_line = indent >> identifier >> newline.maybe
    identifier >> newline >> child_line.repeat.as(:children)
  end

  rule(:document) { node.repeat }
  root :document
end
require 'ap'
require 'pp'
begin
input = DATA.read
puts '', '----- input ----------------------------------------------------------------------', ''
ap input
tree = IndentationSensitiveParser.new.parse(input)
puts '', '----- tree -----------------------------------------------------------------------', ''
ap tree
rescue IndentationSensitiveParser::ParseFailed => failure
puts '', '----- error ----------------------------------------------------------------------', ''
puts failure.cause.ascii_tree
end
__END__
user
name
age
recipe
name
foo
bar
It's clear that I need a dynamic counter that expects 3 indentation nodes to match an identifier on the nesting level 3.
How can I implement an indentation sensitive syntax parser using Parslet in this way? Is it possible?
There are a few approaches.
Parse the document by recognising each line as a collection of indents and an identifier, then apply a transformation afterwards to reconstruct the hierarchy based on the number of indents.
Use captures to store the current indent and expect the next node to include that indent plus more to match as a child (I didn't dig into this approach much as the next one occurred to me)
Rules are just methods. So you can define 'node' as a method, which means you can pass parameters! (as follows)
This lets you define node(depth) in terms of node(depth+1). The problem with this approach, however, is that the node method doesn't match a string, it generates a parser. So a recursive call will never finish.
This is why dynamic exists. It returns a parser that isn't resolved until the point it tries to match it, allowing you to now recurse without problems.
See the following code:
require 'parslet'
# Indentation-sensitive parser whose node rule is an ordinary method taking
# the expected depth, so that children can be described as node(depth + 1).
class IndentationSensitiveParser < Parslet::Parser
  rule(:newline)    { str("\n") }
  rule(:identifier) { match['A-Za-z0-9'].repeat(1).as(:identifier) }

  # Atom matching the indentation string repeated +depth+ times.
  def indent(depth)
    str(' ' * depth)
  end

  # Atom for one node at +depth+: its indentation, an identifier, an
  # optional newline, then zero or more nodes one level deeper.
  def node(depth)
    # Wrapped in dynamic so the deeper atom is only built when matching
    # reaches this point; building it eagerly would recurse without end.
    deeper_nodes = dynamic { |s, c| node(depth + 1).repeat(0) }
    indent(depth) >> identifier >> newline.maybe >> deeper_nodes.as(:children)
  end

  rule(:document) { node(0).repeat }
  root :document
end
This is my favoured solution.
I don't like the idea of weaving knowledge of the indentation process through the whole grammar. I would rather just have INDENT and DEDENT tokens produced that other rules could use similarly to just matching "{" and "}" characters. So the following is my solution. It is a class IndentParser that any parser can extend to get nl, indent, and dedent tokens generated.
require 'parslet'
# Atoms returned from a dynamic that aren't meant to match anything.
# Atom whose try always returns succ("") regardless of the source; used as
# the "matched, nothing to capture" result of a dynamic block.
class AlwaysMatch < Parslet::Atoms::Base
  def try(source, context, consume_all)
    succ("")
  end
end
# Atom whose try always reports an error through context.err, carrying
# +msg+ (default "ignore") as the message.
class NeverMatch < Parslet::Atoms::Base
  attr_accessor :msg

  def initialize(msg = "ignore")
    @msg = msg
  end

  def try(source, context, consume_all)
    context.err(self, source, msg)
  end
end
# Atom whose try always reports an error through context.err with the
# required +msg+; used for genuine indentation errors.
class ErrorMatch < Parslet::Atoms::Base
  attr_accessor :msg

  def initialize(msg)
    @msg = msg
  end

  def try(source, context, consume_all)
    context.err(self, source, msg)
  end
end
##
# Base parser providing +nl+, +indent+ and +dedent+ rules.
#
# Indentation handling: when matching a newline we check the following
# indentation. If that indicates an indent token or dedent tokens (1+) then
# we record them in instance variables, and the indent/dedent rules will
# match as long as these remain. The nl rule consumes the indentation itself.
#
# State kept on the parser instance:
#   @indentStack - indentation strings of the currently open levels
#   @indent      - set when the last newline opened a deeper level
#   @dedents     - one entry per level closed and not yet consumed
class IndentParser < Parslet::Parser
  # Matches (consuming nothing) exactly once after a newline that increased
  # the indentation.
  rule(:indent) do
    dynamic do |s, c|
      if @indent.nil?
        NeverMatch.new("Not an indent")
      else
        @indent = nil
        AlwaysMatch.new
      end
    end
  end

  # Matches (consuming nothing) once per pending closed level.
  rule(:dedent) do
    dynamic do |s, c|
      if @dedents.nil? || @dedents.empty?
        NeverMatch.new("Not a dedent")
      else
        @dedents.pop
        AlwaysMatch.new
      end
    end
  end

  # Consumes the spaces/tabs at the current source position and compares
  # them with the innermost open indentation level.
  #
  # @param source the Parslet source positioned just after a newline
  # @param ctx the Parslet context (unused)
  # @return [AlwaysMatch, ErrorMatch] atom for the dynamic block to apply
  def checkIndentation(source, ctx)
    # See if next line starts with indentation. If so, consume it and then
    # process whether it is an indent or some number of dedents.
    indent = ""
    indent += source.consume(1).to_s while source.matches?(/[ \t]/) # consume returns a Slice

    @indentStack ||= [""]
    current = @indentStack[-1]
    return AlwaysMatch.new if current == indent # no change, just match nl

    if indent.start_with?(current)
      # Getting deeper
      @indentStack << indent
      @indent = indent # tells the indent rule to match one
      return AlwaysMatch.new
    end

    # Either some number of dedents or an error.
    # Walk the open levels from the innermost outwards.
    count = 0
    @indentStack.reverse_each do |level|
      break if indent == level # found the level we returned to

      # Not a match and not a valid prefix, so an error!
      return ErrorMatch.new("Mismatched indentation level") unless level.start_with?(indent)

      # New indent is a prefix, so this level has been closed.
      count += 1
    end
    @dedents ||= []
    count.times { @dedents << @indentStack.pop }
    AlwaysMatch.new
  end

  rule(:nl) { anynl >> dynamic { |source, ctx| checkIndentation(source, ctx) } }

  rule(:unixnl) { str("\n") }
  rule(:macnl)  { str("\r") }
  rule(:winnl)  { str("\r\n") }
  # Alternatives are tried in order, so "\r\n" must come before "\r";
  # otherwise "\r" matches first and the "\n" is left behind.
  rule(:anynl)  { winnl | unixnl | macnl }
end
I'm sure a lot can be improved, but this is what I've come up with so far.
Example usage:
# Example grammar built on IndentParser: a named block is an identifier, a
# colon, a newline, then an indented run of statements closed by a dedent.
class MyParser < IndentParser
  rule(:colon)  { str(':') >> space? }
  # match takes a regular expression tested against a single character, so
  # the space/tab alternatives must be written as a character class.
  rule(:space)  { match('[ \t]').repeat(1) }
  rule(:space?) { space.maybe }

  rule(:number)     { match['0-9'].repeat(1).as(:num) >> space? }
  rule(:identifier) { match['a-zA-Z'] >> match["a-zA-Z0-9"].repeat(0) }

  rule(:block)     { colon >> nl >> indent >> stmt.repeat.as(:stmts) >> dedent }
  rule(:stmt)      { identifier.as(:id) >> nl | number.as(:num) >> nl | testblock }
  rule(:testblock) { identifier.as(:name) >> block }

  rule(:prgm) { testblock >> nl.repeat }
  root :prgm
end

Resources