Ruby Git Diff Line Information Parser - ruby

How can I parse the output of a git diff and get line information (i.e. which lines have been added/modified)?
I would like something similar to
raw = `git diff`
parsed = Git.Diff.parse(raw)
parsed.each do |file|
file.each do |line|
puts "#{file.name} - #{line.number} - #{line.type}"
end
end
Edit:
Sample output
[
{
"file": "path/to/file1",
"lines": [
{ number: "1", type: "modified"},
{ number: "4", type: "deleted"},
{ number: "9", type: "added"}
]
},
{
"file": "path/to/file2",
"lines": [
{ number: "4", type: "modified"},
{ number: "5", type: "added"}
]
}
]

What you need is to correctly group the output in file chunks and keep what is needed.
Getting the diff
You can get it by simply running a
`git diff`
What lines are needed?
lines starting with 'diff --git' from where you can get the file's name
lines starting with '+ ' that are the added ones
lines starting with '- ' that are the removed ones
How to group them?
For these things Enumerable#slice_before comes to mind.
Putting it together
I ended up with this prototype:
# Prototype: count added/removed lines per file from `git diff` output.
raw_data = `git diff`.split("\n")
# Keep what is needed: file headers plus added/removed content lines.
# BUGFIX: Ruby's String method is start_with? (no trailing "s");
# starts_with? raises NoMethodError at runtime.
clean_data = raw_data.select { |li|
  li.start_with?('diff --git') ||
  li.start_with?('- ') ||
  li.start_with?('+ ')
}
# Group them by file
# [[file_1, line1, line2, line3], [file_2, line1]]
file_data = clean_data.slice_before { |li| li.start_with?('diff --git') }
# This is the output format: per-file counters, auto-created on first access.
output = Hash.new { |h, k| h[k] = { added: 0, removed: 0 } }
# Populate the output
file_data.each_with_object(output) do |f_data, memo|
  file, *file_info = f_data
  # Extract "path" out of "diff --git a/path b/path".
  file = file.split(' b/').first.gsub('diff --git a/', '')
  file_info.each { |f_info|
    memo[file][f_info[0] == '+' ? :added : :removed] += 1
  }
end
Output example
{
"file_1" => { added: 1, removed: 12 },
"file_2" => { added: 0, removed: 1 }
}
I am sure it can get better :-)

Here is what I ended up with
# Parses raw `git diff` output into an array of Hunk objects.
class Parser
  # Parse the full diff text; returns Array<Hunk>.
  def parse(text)
    if text.encoding.name != "UTF-8"
      # BUGFIX: this assignment was commented out in the original
      # (`encoded_text = #full_diff.encode(...)`), leaving the assignment
      # without a right-hand side (a syntax error).
      encoded_text = text.encode("UTF-8", "binary", :invalid => :replace, :undef => :replace)
    else
      encoded_text = text
    end
    hunks = []
    hunk = nil
    added_line_number = nil
    deleted_line_number = nil
    lines = encoded_text.strip.split("\n")
    lines.each_with_index do |line, index|
      if m = /^diff --git a\/(.*?) b\/(.*?)$/.match(line)
        raise "Diff formatting error, 'diff --git' is the last line" if index + 1 >= lines.length
        # New file section: reset the per-hunk line counters.
        added_line_number = nil
        # BUGFIX: was `delete_line_number` (typo), which silently created a
        # new local instead of resetting the counter.
        deleted_line_number = nil
        hunk = Hunk.new(m[1], m[2])
        hunk.type = hunk_type(lines[index + 1], m[1], m[2])
        hunks.push(hunk)
      elsif /^Binary files /.match(line)
        # NOTE(review): assumes a 'diff --git' line preceded this one, so
        # `hunk` is non-nil — confirm against malformed input.
        hunk.is_binary = true
      elsif m = /^@@ \-(\d+)(?:,\d+)? \+(\d+)(?:,\d+)? @@/.match(line)
        # Hunk header (e.g. @@ -19,6 +19,7 @@). BUGFIX: the original matched
        # `## ... ##`, which git never emits (markdown-mangled), so line
        # numbers were never tracked and no SourceLines were collected.
        deleted_line_number = Integer(m[1])
        added_line_number = Integer(m[2])
      else
        # Only count content lines once we are inside a hunk.
        if !added_line_number.nil?
          if line.start_with?('+')
            # added line
            hunk.lines.push SourceLine.new(added_line_number, SourceLine::Type::Added, line[1..-1])
            added_line_number += 1
          elsif line.start_with?('-')
            # deleted line
            hunk.lines.push SourceLine.new(deleted_line_number, SourceLine::Type::Deleted, line[1..-1])
            deleted_line_number += 1
          else
            # unmodified (context) line advances both counters
            added_line_number += 1
            deleted_line_number += 1
          end
        end
      end
    end
    hunks
  end

  # Classify a hunk from the line that follows `diff --git`:
  # 'new file'/'deleted file' markers, otherwise modified vs renamed
  # depending on whether the two paths differ.
  def hunk_type(line, original, renamed)
    case line
    when /^new file /
      type = Hunk::Type::Added
    when /^deleted file /
      type = Hunk::Type::Deleted
    else
      type = original == renamed ? Hunk::Type::Modified : Hunk::Type::Renamed
    end
    type
  end
  private :hunk_type
end
# NOTE: the original snippet had one extra unbalanced `end` here; removed.
# String constants naming diff change types.
# NOTE(review): this top-level module duplicates Hunk::Type below and looks
# like a refactoring leftover — confirm which one callers actually use.
module Type
Added = 'added'
Deleted = 'deleted'
Modified = 'modified'
Renamed = 'renamed'
end
# One file-level section of a parsed diff: the pair of paths, the kind of
# change, the collected SourceLines, and a binary-file flag.
class Hunk
  # Change-type labels for a whole file entry.
  module Type
    Added = 'added'
    Deleted = 'deleted'
    Modified = 'modified'
    Renamed = 'renamed'
  end

  attr_accessor :original_path, :renamed_path, :type, :lines, :is_binary
  alias_method :is_binary?, :is_binary

  # A new hunk starts out non-binary with no collected lines.
  def initialize(original_path, renamed_path)
    @original_path = original_path
    @renamed_path  = renamed_path
    @lines         = []
    @is_binary     = false
  end
end
# A single added or deleted line inside a Hunk.
class SourceLine
  # Change-type labels for individual lines.
  module Type
    Added = 'added'
    Deleted = 'deleted'
  end

  attr_accessor :number, :type, :text

  # number: line number in the relevant file version,
  # type:   a SourceLine::Type constant,
  # text:   the line content without its leading +/- marker.
  def initialize(number, type, text)
    @number = number
    @type   = type
    @text   = text
  end
end

You can try out https://github.com/bguban/git_modified_lines gem. It returns only modified lines but probably it will be useful

Related

Thread.critical not working in ruby 2.5.0

I'm using ruby2.5.0 and I have the below function as part of my script. When I run it I get the below error:
`ensure in get_database_connection': undefined method `critical=' for
Thread:Class (NoMethodError)
I understand that for Ruby 1.9.0 and above, Thread.critical is no longer supported, so how can I edit my function to make it run under Ruby 2.5.0?
Thanks.
# Geodict
# Copyright (C) 2010 Pete Warden <pete#petewarden.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
require 'rubygems'
require 'postgres'
require 'set'
# Some hackiness to include the library script, even if invoked from another directory
require File.join(File.expand_path(File.dirname(__FILE__)), 'dstk_config')
# Global holder for the database connections
$connections = {}
# The main entry point. This function takes an unstructured text string and returns a list of all the
# fragments it could identify as locations, together with lat/lon positions
def find_locations_in_text(text)
current_index = text.length-1
result = []
# Reset the per-call memoization cache used by pull_word_from_end; it is
# keyed by index only, so it must not survive across different texts.
$tokenized_words = {}
setup_countries_cache()
setup_regions_cache()
# This loop goes through the text string in *reverse* order. Since locations in English are typically
# described with the broadest category last, preceded by more and more specific designations towards
# the beginning, it simplifies things to walk the string in that direction too
while current_index>=0 do
current_word, pulled_index, ignored_skipped = pull_word_from_end(text, current_index)
lower_word = current_word.downcase
could_be_country = $countries_cache.has_key?(lower_word)
could_be_region = $regions_cache.has_key?(lower_word)
# Quick pre-filter on the last word.
# NOTE(review): this only checks the country and region caches, but some
# $token_sequences end in :CITY (e.g. [:LOCATION_WORD, :CITY]) — confirm
# those are not being skipped by this shortcut.
if not could_be_country and not could_be_region
current_index = pulled_index
next
end
# This holds the results of the match function for the final element of the sequence. This lets us
# optimize out repeated calls to see if the end of the current string is a country for example
match_cache = {}
token_result = nil
# These 'token sequences' describe patterns of discrete location elements that we'll look for.
$token_sequences.each() do |token_sequence|
# The sequences are specified in the order they'll occur in the text, but since we're walking
# backwards we need to reverse them and go through the sequence in that order too
token_sequence = token_sequence.reverse
# Now go through the sequence and see if we can match up all the tokens in it with parts of
# the string
token_result = nil
token_index = current_index
token_sequence.each_with_index do |token_name, token_position|
# The token definition describes how to recognize part of a string as a match. Typical
# tokens include country, city and region names
token_definition = $token_definitions[token_name]
match_function = token_definition[:match_function]
# This logic optimizes out repeated calls to the same match function
if token_position == 0 and match_cache.has_key?(token_name)
token_result = match_cache[token_name]
else
# The meat of the algorithm, checks the ending of the current string against the
# token testing function, eg seeing if it matches a country name
token_result = send(match_function, text, token_index, token_result)
if token_position == 0
match_cache[token_name] = token_result
end
end
if !token_result
# The string doesn't match this token, so the sequence as a whole isn't a match
break
else
# The current token did match, so move backwards through the string to the start of
# the matched portion, and see if the preceding words match the next required token
token_index = token_result[:found_tokens][0][:start_index]-1
end
end
# We got through the whole sequence and all the tokens match, so we have a winner!
if token_result
current_word, current_index, end_skipped = pull_word_from_end(text, current_index)
break
end
end
if !token_result
# None of the sequences matched, so back up a word and start over again
ignored_word, current_index, end_skipped = pull_word_from_end(text, current_index)
else
# We found a matching sequence, so add the information to the result
result.push(token_result)
found_tokens = token_result[:found_tokens]
# Resume scanning just before the start of the matched span.
current_index = found_tokens[0][:start_index]-1
end
end
# Reverse the result so it's in the order that the locations occurred in the text
result.reverse!
return result
end
# Functions that look at a small portion of the text, and try to identify any location identifiers
# Caches the countries and regions tables in memory
$countries_cache = {}
$is_countries_cache_setup = false

# Loads every row of the countries table into $countries_cache, keyed by the
# lower-cased last word of the country name. Idempotent: the database is
# only queried on the first call.
def setup_countries_cache()
  return if $is_countries_cache_setup
  rows = select_as_hashes('SELECT * FROM countries', DSTKConfig::DATABASE)
  rows.each do |row|
    key = row['last_word'].downcase
    ($countries_cache[key] ||= []).push(row)
  end
  $is_countries_cache_setup = true
end
$regions_cache = {}
$is_regions_cache_setup = false

# Loads every row of the regions table into $regions_cache, keyed by the
# lower-cased last word of the region name. Idempotent: the database is
# only queried on the first call.
def setup_regions_cache()
  return if $is_regions_cache_setup
  rows = select_as_hashes('SELECT * FROM regions', DSTKConfig::DATABASE)
  rows.each do |row|
    key = row['last_word'].downcase
    ($regions_cache[key] ||= []).push(row)
  end
  $is_regions_cache_setup = true
end
# Translates a two-letter country code (e.g. "us") into a readable name.
# Returns nil for a nil/false code, and falls back to returning the code
# itself when no matching row is found.
def get_country_name_from_code(country_code)
  return nil unless country_code
  setup_countries_cache()
  result = country_code
  $countries_cache.each_value do |countries|
    countries.each do |row|
      next unless row['country_code']
      result = row['country'] if row['country_code'].downcase == country_code.downcase
    end
  end
  result
end
# Matches the current fragment against our database of countries
# Matches the words at text_starting_index (scanning backwards) against the
# in-memory countries cache. Returns an updated result hash with a :COUNTRY
# token prepended to :found_tokens, or nil when no country matches.
def is_country(text, text_starting_index, previous_result)
current_word = ''
current_index = text_starting_index
pulled_word_count = 0
found_row = nil
# Walk backwards through the current fragment, pulling out words and seeing if they match
# the country names we know about
# NOTE: `while` does not open a new scope in Ruby, so word_end_index and
# name_map assigned in the first iteration remain visible in later ones.
while pulled_word_count < DSTKConfig::WORD_MAX do
pulled_word, current_index, end_skipped = pull_word_from_end(text, current_index)
pulled_word_count += 1
if current_word == ''
# This is the first time through, so the full word is just the one we pulled
current_word = pulled_word
# Make a note of the real end of the word, ignoring any trailing whitespace
word_end_index = (text_starting_index-end_skipped)
# We've indexed the locations by the word they end with, so find all of them
# that have the current word as a suffix
last_word = pulled_word.downcase
if !$countries_cache.has_key?(last_word)
break
end
candidate_dicts = $countries_cache[last_word]
name_map = {}
candidate_dicts.each do |candidate_dict|
name = candidate_dict['country'].downcase
name_map[name] = candidate_dict
end
else
current_word = pulled_word+' '+current_word
end
# This happens if we've walked backwards all the way to the start of the string
if current_word == ''
return nil
end
# If the first letter of the name is lower case, then it can't be the start of a country
# Somewhat arbitrary, but for my purposes it's better to miss some ambiguous ones like this
# than to pull in erroneous words as countries (eg thinking the 'uk' in .co.uk is a country)
if current_word[0].chr =~ /[a-z]/
next
end
name_key = current_word.downcase
if name_map.has_key?(name_key)
found_row = name_map[name_key]
end
if found_row
# We've found a valid country name
break
end
if current_index < 0
# We've walked back to the start of the string
break
end
end
if !found_row
# We've walked backwards through the current words, and haven't found a good country match
return nil
end
# Were there any tokens found already in the sequence? Unlikely with countries, but for
# consistency's sake I'm leaving the logic in
if !previous_result
current_result = {
:found_tokens => [],
}
else
current_result = previous_result
end
country_code = found_row['country_code']
lat = found_row['lat']
lon = found_row['lon']
# Prepend all the information we've found out about this location to the start of the :found_tokens
# array in the result
current_result[:found_tokens].unshift({
:type => :COUNTRY,
:code => country_code,
:lat => lat,
:lon => lon,
:matched_string => current_word,
:start_index => (current_index+1),
:end_index => word_end_index
})
return current_result
end
# Looks through our database of 2 million towns and cities around the world
# to locate any that match the words at the end of the current text fragment.
# Returns an updated result hash with a :CITY token prepended, or nil.
def is_city(text, text_starting_index, previous_result)
  # If we're part of a sequence, then use any country or region information
  # already found to narrow down our search.
  country_code = nil
  region_code = nil
  if previous_result
    found_tokens = previous_result[:found_tokens]
    found_tokens.each do |found_token|
      type = found_token[:type]
      # BUGFIX: the original read `if type == :COUNTRY:` / `elsif type == :REGION:`
      # — the trailing colons are Python syntax and do not parse in Ruby.
      if type == :COUNTRY
        country_code = found_token[:code]
      elsif type == :REGION
        region_code = found_token[:code]
      end
    end
  end
  current_word = ''
  current_index = text_starting_index
  pulled_word_count = 0
  found_row = nil
  while pulled_word_count < DSTKConfig::WORD_MAX do
    pulled_word, current_index, end_skipped = pull_word_from_end(text, current_index)
    pulled_word_count += 1
    if current_word == ''
      current_word = pulled_word
      word_end_index = (text_starting_index - end_skipped)
      # SECURITY NOTE: words from free text are interpolated straight into
      # SQL; this should use parameter binding or proper quoting.
      select = "SELECT * FROM cities WHERE last_word='" + pulled_word.downcase + "'"
      if country_code
        select += " AND country='" + country_code.downcase + "'"
      end
      if region_code
        select += " AND region_code='" + region_code.upcase.strip + "'"
      end
      # There may be multiple cities with the same name, so pick the one with
      # the largest population (later rows overwrite earlier ones below).
      select += ' ORDER BY population;'
      hashes = select_as_hashes(select, DSTKConfig::DATABASE)
      name_map = {}
      hashes.each do |hash|
        name = hash['city'].downcase
        name_map[name] = hash
      end
    else
      current_word = pulled_word + ' ' + current_word
    end
    # We've walked backwards all the way to the start of the string.
    if current_word == ''
      return nil
    end
    # A lower-case first letter can't start a proper name; keep extending.
    if current_word[0].chr =~ /[a-z]/
      next
    end
    name_key = current_word.downcase
    if name_map.has_key?(name_key)
      found_row = name_map[name_key]
    end
    break if found_row
    break if current_index < 0
  end
  return nil if !found_row
  if !previous_result
    current_result = { :found_tokens => [] }
  else
    current_result = previous_result
  end
  lat = found_row['lat']
  lon = found_row['lon']
  country_code = found_row['country'].downcase
  current_result[:found_tokens].unshift({
    :type => :CITY,
    :lat => lat,
    :lon => lon,
    :country_code => country_code,
    :matched_string => current_word,
    :start_index => (current_index + 1),
    :end_index => word_end_index
  })
  return current_result
end
# This looks for sub-regions within countries. At the moment the only values
# in the database are for US states. Returns an updated result hash with a
# :REGION token prepended, or nil when no region matches.
def is_region(text, text_starting_index, previous_result)
  # Narrow down the search by country, if we already have it
  country_code = nil
  if previous_result
    found_tokens = previous_result[:found_tokens]
    found_tokens.each do |found_token|
      type = found_token[:type]
      if type == :COUNTRY
        country_code = found_token[:code]
      end
    end
  end
  current_word = ''
  current_index = text_starting_index
  pulled_word_count = 0
  found_row = nil
  while pulled_word_count < DSTKConfig::WORD_MAX do
    pulled_word, current_index, end_skipped = pull_word_from_end(text, current_index)
    pulled_word_count += 1
    if current_word == ''
      current_word = pulled_word
      word_end_index = (text_starting_index - end_skipped)
      last_word = pulled_word.downcase
      if !$regions_cache.has_key?(last_word)
        break
      end
      all_candidate_dicts = $regions_cache[last_word]
      if country_code
        # Keep only regions that belong to the country we already matched.
        candidate_dicts = []
        all_candidate_dicts.each do |possible_dict|
          candidate_country = possible_dict['country_code']
          # BUGFIX: the original line ended with a Python-style colon
          # (`... == country_code.downcase():`), which is a Ruby syntax error.
          if candidate_country.downcase() == country_code.downcase()
            candidate_dicts << possible_dict
          end
        end
      else
        candidate_dicts = all_candidate_dicts
      end
      name_map = {}
      candidate_dicts.each do |candidate_dict|
        name = candidate_dict['region'].downcase
        name_map[name] = candidate_dict
      end
    else
      current_word = pulled_word + ' ' + current_word
    end
    # We've walked backwards all the way to the start of the string.
    if current_word == ''
      return nil
    end
    # A lower-case first letter can't start a proper name; keep extending.
    if current_word[0].chr =~ /[a-z]/
      next
    end
    name_key = current_word.downcase
    if name_map.has_key?(name_key)
      found_row = name_map[name_key]
    end
    break if found_row
    break if current_index < 0
  end
  return nil if !found_row
  if !previous_result
    current_result = { :found_tokens => [] }
  else
    current_result = previous_result
  end
  region_code = found_row['region_code']
  lat = found_row['lat']
  lon = found_row['lon']
  country_code = found_row['country_code'].downcase
  current_result[:found_tokens].unshift({
    :type => :REGION,
    :code => region_code,
    :lat => lat,
    :lon => lon,
    :country_code => country_code,
    :matched_string => current_word,
    :start_index => (current_index + 1),
    :end_index => word_end_index
  })
  return current_result
end
# A special case - looks for a cue word like 'at' or 'in' before a possible
# location, which adds confidence it really is a location in context
# (think 'the New York Times' vs 'in New York'). Returns previous_result
# unchanged when the cue word matches, nil otherwise.
def is_location_word(text, text_starting_index, previous_result)
  current_word, _current_index, _end_skipped =
    pull_word_from_end(text, text_starting_index)
  # Nothing left to read at this position.
  return nil if current_word == ''
  current_word.downcase!
  # This token contributes no location data of its own; it only gates the
  # sequence, so the previous result is passed through untouched.
  return nil unless DSTKConfig::LOCATION_WORDS.has_key?(current_word)
  previous_result
end
# Matches postal codes (confirmed against an already-found country, and
# optionally a preceding region). Returns an updated result hash with a
# :POSTAL_CODE token prepended, or nil.
def is_postal_code(text, text_starting_index, previous_result)
  # Narrow down the search by country, if we already have it
  country_code = nil
  if previous_result
    found_tokens = previous_result[:found_tokens]
    found_tokens.each do |found_token|
      type = found_token[:type]
      if type == :COUNTRY
        country_code = found_token[:code]
      end
    end
  end
  current_word = ''
  current_index = text_starting_index
  pulled_word_count = 0
  found_rows = nil
  while pulled_word_count < DSTKConfig::WORD_MAX do
    pulled_word, current_index, end_skipped = pull_word_from_end(text, current_index)
    pulled_word_count += 1
    if current_word == ''
      current_word = pulled_word
      word_end_index = (text_starting_index - end_skipped)
      # SECURITY NOTE: free text interpolated into SQL — should be parameterized.
      # BUGFIX: the two last_word alternatives are now parenthesized. The
      # original built `... WHERE last_word='x' OR last_word='X' AND
      # country_code='Y'`; SQL's AND binds tighter than OR, so the country
      # filter only applied to the second alternative.
      select = "SELECT * FROM postal_codes"
      select += " WHERE (last_word='" + pulled_word.downcase + "'"
      select += " OR last_word='" + pulled_word.upcase + "')"
      if country_code
        select += " AND country_code='" + country_code.upcase + "'"
      end
      candidate_dicts = select_as_hashes(select, DSTKConfig::DATABASE)
      # A postal code string may map to several rows (one per country).
      name_map = {}
      candidate_dicts.each do |candidate_dict|
        name = candidate_dict['postal_code'].downcase
        (name_map[name] ||= []) << candidate_dict
      end
    else
      current_word = pulled_word + ' ' + current_word
    end
    # We've walked backwards all the way to the start of the string.
    return nil if current_word == ''
    # A lower-case first letter can't start a postal code here; keep extending.
    next if current_word[0].chr =~ /[a-z]/
    name_key = current_word.downcase
    if name_map.has_key?(name_key)
      found_rows = name_map[name_key]
    end
    break if found_rows
    break if current_index < 0
  end
  return nil if !found_rows
  # Confirm the postal code against the country suffix; without a country in
  # the sequence we can't disambiguate, so no match is reported.
  found_row = nil
  if country_code
    found_rows.each do |row|
      if row['country_code'] == country_code
        found_row = row
        break
      end
    end
  end
  return nil if !found_row
  # Also pull in the prefixed region, if there is one
  region_result = is_region(text, current_index, nil)
  if region_result
    region_token = region_result[:found_tokens][0]
    if found_row['region_code'] == region_token[:code]
      current_index = region_token[:start_index] - 1
      current_word = region_token[:matched_string] + ' ' + current_word
    end
  end
  if !previous_result
    current_result = { :found_tokens => [] }
  else
    current_result = previous_result
  end
  lat = found_row['lat']
  lon = found_row['lon']
  country_code = found_row['country_code'].downcase
  region_code = found_row['region_code'].downcase
  postal_code = found_row['postal_code'].downcase
  current_result[:found_tokens].unshift({
    :type => :POSTAL_CODE,
    :code => postal_code,
    :lat => lat,
    :lon => lon,
    :region_code => region_code,
    :country_code => country_code,
    :matched_string => current_word,
    :start_index => (current_index + 1),
    :end_index => word_end_index
  })
  return current_result
end
# Characters treated as separators when pulling out words.
WHITESPACE = " \t'\",.-/\n\r<>!?".split(//).to_set

# Memoized pull_word_from_end results, keyed by starting index.
# NOTE: keyed by index only — find_locations_in_text resets it per text.
$tokenized_words = {}

# Walks backwards through `text` starting at `index`: skips any trailing
# separator characters, then collects one unbroken word.
# Returns [word, index_just_before_word, count_of_skipped_trailing_chars].
def pull_word_from_end(text, index, use_cache=true)
  return $tokenized_words[index] if use_cache && $tokenized_words.has_key?(index)
  word = ''
  pos = index
  skipped = 0
  while pos >= 0
    ch = text[pos].chr
    pos -= 1
    if WHITESPACE.include?(ch)
      # Separators before the word starts are counted and skipped; the
      # first separator after the word has begun terminates it.
      if word == ''
        skipped += 1
        next
      end
      pos += 1
      break
    end
    word << ch
  end
  # Characters were collected back-to-front, so flip them.
  word.reverse!
  result = [word, pos, skipped]
  $tokenized_words[index] = result
  return result
end
# Converts one SQL result row (a positional array) into a hash keyed by the
# parallel array of column names.
def get_hash_from_row(fields, row)
  # Pair each field name with the value at the same position; positions
  # beyond the end of `row` map to nil, just as indexing would.
  fields.zip(row).to_h
end
# Returns the most specific token from the array: the one whose :type has
# the lowest priority number in $token_priorities. Returns nil when the
# token list is nil or empty.
def get_most_specific_token(tokens)
  return nil unless tokens
  best = nil
  best_priority = nil
  tokens.each do |token|
    priority = $token_priorities[token[:type]]
    if best.nil? || best_priority > priority
      best = token
      best_priority = priority
    end
  end
  best
end
# Lazily-created lock replacing the removed Thread.critical API (see below).
$db_mutex ||= Mutex.new

# Returns the results of the SQL select statement as an array of hashes,
# one per row, keyed by column name.
#
# BUGFIX: Thread.critical= was removed in Ruby 1.9, so calling it raises
# NoMethodError on any modern Ruby. A Mutex now guards the shared
# connection instead of suspending the whole scheduler.
def select_as_hashes(select, database_name)
  result = nil
  begin
    conn = get_database_connection(database_name)
    $db_mutex.synchronize do
      res = conn.exec('BEGIN')
      res.clear
      res = conn.exec('DECLARE myportal CURSOR FOR ' + select)
      res.clear
      res = conn.exec('FETCH ALL in myportal')
      fields = res.fields
      rows = res.result
      res = conn.exec('CLOSE myportal')
      res = conn.exec('END')
      result = []
      rows.each do |row|
        result.push(get_hash_from_row(fields, row))
      end
    end
  rescue PGError
    if conn
      printf(STDERR, conn.error)
    else
      $stderr.puts 'select_as_hashes() - no connection for ' + database_name
    end
    conn.close if conn
    $connections[database_name] = nil
    # NOTE(review): hard-exiting the whole process on a query failure is
    # drastic (preserved from the original) — consider raising instead.
    exit(1)
  end
  return result
end
# Lazily-created lock replacing the removed Thread.critical API (see below).
$db_mutex ||= Mutex.new

# Returns (creating and caching in $connections if needed) the PostgreSQL
# connection for database_name. Logs to stderr and returns nil when the
# connection could not be opened.
#
# BUGFIX: Thread.critical= was removed in Ruby 1.9; a Mutex now ensures only
# one thread opens and caches a connection at a time.
def get_database_connection(database_name)
  $db_mutex.synchronize do
    if !$connections[database_name]
      $connections[database_name] = PGconn.connect(DSTKConfig::HOST,
        DSTKConfig::PORT,
        '',
        '',
        database_name,
        DSTKConfig::USER,
        DSTKConfig::PASSWORD)
    end
  end
  if !$connections[database_name]
    $stderr.puts "get_database_connection('#{database_name}') - Couldn't open connection"
  end
  $connections[database_name]
end
# Types of locations we'll be looking for
# Each token type names the predicate that recognizes it; the functions are
# invoked dynamically via send() from find_locations_in_text.
$token_definitions = {
:COUNTRY => {
:match_function => :is_country
},
:CITY => {
:match_function => :is_city
},
:REGION => {
:match_function => :is_region
},
:LOCATION_WORD => {
:match_function => :is_location_word
},
:POSTAL_CODE => {
:match_function => :is_postal_code
}
}
# Particular sequences of those location words that give us more confidence they're actually describing
# a place in the text, and aren't coincidental names (eg 'New York Times')
# Listed in text order; find_locations_in_text reverses each sequence since
# it scans the text backwards.
$token_sequences = [
[ :POSTAL_CODE, :REGION, :COUNTRY ],
[ :REGION, :POSTAL_CODE, :COUNTRY ],
[ :POSTAL_CODE, :CITY, :COUNTRY ],
[ :POSTAL_CODE, :COUNTRY ],
[ :CITY, :COUNTRY ],
[ :CITY, :REGION ],
[ :REGION, :COUNTRY ],
[ :COUNTRY ],
[ :LOCATION_WORD, :REGION ], # Regions and cities are too common as words to use without additional evidence
[ :LOCATION_WORD, :CITY ]
]
# Location identifiers in order of decreasing specificity
# (lower number = more specific; consumed by get_most_specific_token)
$token_priorities = {
:POSTAL_CODE => 0,
:CITY => 1,
:REGION => 2,
:COUNTRY => 3,
}
if __FILE__ == $0
require 'json'
# Demo / smoke test: run the extractor over a mixed bag of location strings
# and print one JSON object per detected location.
test_text = <<-TEXT
Spain
Italy
Bulgaria
Foofofofof
New Zealand
Barcelona, Spain
Wellington New Zealand
I've been working on the railroad, all the live-long day! The quick brown fox jumped over the lazy dog in Alabama
I'm mentioning Los Angeles here, but without California or CA right after it, it won't be detected. If I talk about living in Wisconsin on the other hand, that 'in' gives the algorithm extra evidence it's actually a location.
It should still pick up more qualified names like Amman Jordan or Atlanta, Georgia though!
Dallas, TX or New York, NY
It should now pick up Queensland, Australia, or even NSW, Australia!
Postal codes like QLD 4002, Australia, QC H3W, Canada, 2608 Lillehammer, Norway, or CA 94117, USA are supported too.
TEXT
puts "Analyzing '#{test_text}'"
puts "Found locations:"
locations = find_locations_in_text(test_text)
locations.each_with_index do |location_info, index|
found_tokens = location_info[:found_tokens]
# Use the most specific token (e.g. postal code over country) as the headline.
location = get_most_specific_token(found_tokens)
# The full matched span runs from the first token's start to the last token's end.
match_start_index = found_tokens[0][:start_index]
match_end_index = found_tokens[found_tokens.length-1][:end_index]
matched_string = test_text[match_start_index..match_end_index]
result = {
'type' => location[:type],
'name' => location[:matched_string],
'latitude' => location[:lat].to_s,
'longitude' => location[:lon].to_s,
'start_index' => location[:start_index].to_s,
'end_index' => location[:end_index].to_s,
'matched_string' => matched_string,
'country' => location[:country_code],
'code' => location[:code],
}
puts result.to_json
end
end
You can either remove the call, or ask first before trying:
Thread.respond_to?(:critical=) and Thread.critical = true
That being said, since Thread.critical= was removed in Ruby 1.9 it's pretty safe to trash that code entirely. Anyone running Ruby 1.8.x is living dangerously.
Unless you have a specific requirement to support 1.8.x, you'll have to delete the calls and use an alternative.
The purpose of critical= is to prevent pre-emption of the thread by another. That's a really heavy-handed way to synchronize threads, and dangerous enough that Ruby pulled support for it lest that start to become more pervasive.
What you probably want is a Mutex if you need to lock a resource. There's no obviously shared resources here unless get_database_connection returns one. It doesn't seem to as the connection is closed on error.
This code is full of some seriously suspect things, like using $connections, a global variable, and hard-exiting the whole process on failure. You may want to do a more thorough investigation as to what the purpose of the critical lock was in the first place.

How to Hash content to write in file as format mentioned as below?

I have written my Ruby script for that. In it you can check that "all_data" has all the required content.
#!/usr/bin/env ruby
# Extracts heading/description text from the third column of every table in
# the .docx files below, groups it per chapter, and writes markdown files.
require 'docx'

file_data = []
name_file = "test"
t = ""
array_desc = []
heading_hash = {}
all_data = {}
temp = ""
output = ""
folder_name = ""
directory_name = ""
flag = true
count = 0
md_file_name = ''

Dir.glob("**/*.docx") do |file_name|
  doc = Docx::Document.open(file_name)
  first_table = doc.tables[0]
  doc.tables.each do |table|
    table.rows.each do |row| # Row-based iteration
      row.cells.each_with_index do |cell, i|
        # Only the third column holds the text we need.
        file_data << cell.text.gsub('=', '') if i == 2
      end
    end
  end
  file_data.each_with_index do |l, d|
    # NOTE(review): `l.include? file_data[d]` is always true since l IS
    # file_data[d]; kept for compatibility, but it can be dropped.
    if l.include? file_data[d]
      # Entries starting with a digit are section numbers / headings.
      if ((l.strip)[0].to_i != 0)
        md_file_name = file_data[d].split(".")
        # start folder name
        if flag
          directory_name = md_file_name[0].to_i
          flag = false
        end
        count += 1
        t = file_data[d + 1]
        if (array_desc.size > 0)
          heading_hash[temp] = array_desc
          all_data[md_file_name[0].strip] = heading_hash
          array_desc = []
        end
      else
        if (t != l)
          array_desc << l
          temp = t
        end
      end
    end
  end
  # Flush any description lines collected after the last heading.
  if (array_desc.size > 0)
    heading_hash[temp] = array_desc
    all_data[md_file_name[0].strip] = heading_hash
    array_desc = []
  end
  all_data.each do |k, v|
    v.each do |(hk, hv)|
      if hk != ""
        # Zero-pad single-digit chapter numbers (01, 02, ...).
        chapter_no = k
        if (k[0, 1] == 0.to_s)
          chapter_no = k
        else
          chapter_no = "0#{k}"
        end
        # BUGFIX: File.exists? was removed in Ruby 3.2 — use File.exist?.
        Dir.mkdir("#{chapter_no}") unless File.exist?("#{chapter_no}")
        output_name = "#{chapter_no}/#{File.basename("01", '.*')}.md"
        # BUGFIX: the handle returned by File.open was never closed; the
        # block form flushes and closes it automatically.
        File.open(output_name, 'w') do |output|
          # output << "#"+"#{hk}\n\n"
          # output << "#{hv} \n\n"
          hv.each do |des|
            # puts des
          end
        end
      end
    end
  end
end
source docx file
Download the above file and put the script and the docx (source) file in the same folder. When you run the script from the terminal ($./script.rb) you will see folder names such as 01, 02, ... etc. And inside each there will be a file with the md extension.
I want to output as below description:
## FOLDER 01 > FILE 01.md, here data in file like hk as heading (for Heading you can put # before hk)and hv
## FOLDER 02 > FILE 01.md, here data in file like hk as heading (for Heading you can put # before hk)and hv
Please use my code and check whether it works for you.
# Collects table text from each .docx, groups description lines under their
# headings, and appends the results to sequentially numbered chapter files.
Dir.glob("**/*.docx") do |file_name|
  doc = Docx::Document.open(file_name)
  first_table = doc.tables[0]
  doc.tables.each do |table|
    table.rows.each do |row|
      row.cells.each_with_index do |cell, i|
        # Only the third column holds the text we need.
        file_data << cell.text.gsub('=', '') if i == 2
      end
    end
  end
  file_data.each_with_index do |l, d|
    # Entries starting with a digit are section numbers / headings.
    if ((l.strip)[0].to_i != 0)
      md_file_name = file_data[d].split(".")
      # start folder name
      if flag
        directory_name = md_file_name[0].to_i
        flag = false
      end
      count += 1
      t = file_data[d + 1]
      if (array_desc.size > 0)
        heading_hash[temp] = array_desc
        array_desc = []
        all_data[file_data[d + 1]] = array_desc
      end
    else
      if (t != l)
        array_desc << l
        temp = t
      end
    end
  end
  chapter_no = 1
  all_data.each do |k, v|
    # BUGFIX: File.exists? was removed in Ruby 3.2 — use File.exist?.
    Dir.mkdir("#{chapter_no}") unless File.exist?("#{chapter_no}")
    output_name = "#{chapter_no}/#{File.basename("01", '.*')}.md"
    # BUGFIX: open with a block so the handle is flushed and closed instead
    # of leaking one file descriptor per chapter.
    File.open(output_name, 'a') do |output|
      output << "#" + "#{k}\n\n"
      v.each do |d|
        output << "#{d} \n"
      end
    end
    chapter_no = chapter_no + 1
  end
end
It will give exact output as you shared above. Let me know if you need more help.

How to merge multiple hashes?

Right now, I'm merging two hashes like this:
department_hash = self.parse_department html
super_saver_hash = self.parse_super_saver html
final_hash = department_hash.merge(super_saver_hash)
Output:
{:department=>{"Pet Supplies"=>{"Birds"=>16281, "Cats"=>245512,
"Dogs"=>513926, "Fish & Aquatic Pets"=>46811, "Horses"=>14805,
"Insects"=>364, "Reptiles & Amphibians"=>5816, "Small
Animals"=>19769}}, :super_saver=>{"Free Super Saver
Shipping"=>126649}}
But now I want to merge more in the future. For example:
department_hash = self.parse_department html
super_saver_hash = self.parse_super_saver html
categories_hash = self.parse_categories html
How to merge multiple hashes?
How about:
[department_hash, super_saver_hash, categories_hash].reduce &:merge
You can just call merge again:
h1 = {foo: :bar}
h2 = {baz: :qux}
h3 = {quux: :garply}
h1.merge(h2).merge(h3)
#=> {:foo=>:bar, :baz=>:qux, :quux=>:garply}
You can do below way using Enumerable#inject:
h = {}
arr = [{:a=>"b"},{"c" => 2},{:a=>4,"c"=>"Hi"}]
arr.inject(h,:update)
# => {:a=>4, "c"=>"Hi"}
arr.inject(:update)
# => {:a=>4, "c"=>"Hi"}
It took me a while to figure out how to merge multi-nested hashes after going through this Question and its Answers. It turned out I was iterating through the collections of hashes incorrectly, causing all kinds of problems with null values.
This sample command-line app shows how to merge multiple hashes with a combination of store and merge!, depending on whether or not they were top-level hash keys. It uses command-line args with a few known key name for categorization purposes.
Full code from the Gist URL is provided below as a courtesy:
# Ruby - A nested hash example
# Load each pair of args on the command-line as a key-value pair
# For example from CMD.exe:
# call ruby.exe ruby_nested_hash_example.rb Age 30 Name Mary Fav_Hobby Ataraxia Fav_Number 42
# Output would be:
# {
# "data_info": {
# "types": {
# "nums": {
# "Age": 30,
# "Fav_Number": 42
# },
# "strings": {
# "Name": "Mary",
# "Fav_Hobby": "Ataraxia"
# }
# },
# "data_id": "13435436457"
# }
# }
# Guard: we need an even, non-zero number of args to form key-value pairs.
if (ARGV.count % 2 != 0) || (ARGV.count < 2)
  STDERR.puts "You must provide an even amount of command-line args to make key-value pairs.\n"
  abort
end

require 'json'

cmd_hashes = {}
nums = {}
strings = {}
types = {}
#FYI `tl` == top-level
all_tl_keys = {}
data_info = {}
data_id = { :data_id => "13435436457" }
_key = ""
_value = ""
element = 0

# Pair up ARGV entries: even positions are keys, odd positions are values.
ARGV.each do |i|
  if element % 2 == 0
    _key = i
  else
    # BUGFIX: the original tested `(i.to_i != 0) && (i != 0)`. The second
    # comparison is String-vs-Integer and therefore always true (dead code),
    # and the value "0" was misclassified as a string because "0".to_i == 0.
    # A strict integer pattern classifies "0" correctly.
    _value = (i =~ /\A-?\d+\z/) ? i.to_i : i
  end
  if (_key != "") && (_value != "")
    cmd_hashes.store(_key, _value)
    _key = ""
    _value = ""
  end
  element += 1
end

# Split the parsed pairs by value type.
cmd_hashes.each do |key, value|
  if value.is_a? Numeric
    nums.store(key, value)
  else
    strings.store(key, value)
  end
end

# Assemble the nested structure, only adding non-empty sections.
if nums.size > 0; types.merge!(:nums => nums) end
if strings.size > 0; types.merge!(:strings => strings) end
if types.size > 0; all_tl_keys.merge!(:types => types) end
if data_id.size > 0; all_tl_keys.merge!(data_id) end
if all_tl_keys.size > 0; data_info.merge!(:data_info => all_tl_keys) end
if data_info.size > 0; puts JSON.pretty_generate(data_info) end
Suppose you are having arr = [{x: 10},{y: 20},{z: 30}]
then do
arr.reduce(:merge)

Parse CSV Data with Ruby

I am trying to return a specific cell value based on two criteria.
The logic:
If ClientID = 1 and BranchID = 1, puts SurveyID
Using Ruby 1.9.3, I want to basically look through an excel file and for two specific values located within the ClientID and BranchID column, return the corresponding value in the SurveyID column.
This is what I have so far, which I found during my online searches. It seemed promising, but no luck:
require 'csv'
# Load file
csv_fname = 'FS_Email_Test.csv'
# Key is the column to check, value is what to match
# NOTE(review): these values are the STRINGS '1', but the :numeric converter
# below turns CSV cells into numbers, so `row[key] == search_criteria[key]`
# compares 1 == '1' and never matches. Use integer 1 instead.
search_criteria = { 'ClientID' => '1',
'BranchID' => '1' }
options = { :headers => :first_row,
:converters => [ :numeric ] }
# Save `matches` and a copy of the `headers`
matches = nil
headers = nil
# Iterate through the `csv` file and locate where
# data matches the options.
CSV.open( csv_fname, "r", options ) do |csv|
matches = csv.find_all do |row|
# Start from true and AND in one comparison per criteria column.
match = true
search_criteria.keys.each do |key|
match = match && ( row[key] == search_criteria[key] )
end
match
end
headers = csv.headers
end
# Once matches are found, we print the results
# for a specific row. The row `row[8]` is
# tied specifically to a notes field.
# NOTE(review): `row = row[1]` prints only the second field of each matching
# row, not the SurveyID column — the question itself flags this as invalid.
matches.each do |row|
row = row[1]
puts row
end
I know the last bit of code following matches.each do |row| is invalid, but I left it in in hopes that it will make sense to someone else.
How can I write puts surveyID if ClientID == 1 & BranchID == 1?
You were very close indeed. Your only error was setting the values of the search_criteria hash to strings '1' instead of numbers. Since you have converters: :numeric in there the find_all was comparing 1 to '1' and getting false. You could just change that and you're done.
Alternatively this should work for you.
The key is the line
Hash[row].select { |k,v| search_criteria[k] } == search_criteria
Hash[row] converts the row into a hash instead of an array of arrays. Select generates a new hash that has only those elements that appear in search_criteria. Then just compare the two hashes to see if they're the same.
require 'csv'

# Load file
source_file = 'FS_Email_Test.csv'

# Column => value pairs a row must match exactly (integers, since the
# :numeric converter below turns CSV cells into numbers).
wanted = {
  'ClientID' => 1,
  'BranchID' => 1,
}

csv_options = {
  headers: :first_row,
  converters: :numeric,
}

# Filled in inside the CSV.open block.
matching_rows = nil
column_names = nil

# Walk the file and keep every row whose criteria columns equal `wanted`.
CSV.open(source_file, 'r', csv_options) do |csv|
  matching_rows = csv.select do |row|
    # Reduce the row to just the criteria columns, then compare hashes.
    Hash[row].select { |col, _| wanted[col] } == wanted
  end
  column_names = csv.headers
end

p column_names

# Print the SurveyID-related field of every matching row.
matching_rows.each { |row| puts row['surveyID'] }
Possibly...
require 'csv'

# Tracks whether the header row has been consumed yet.
header_seen = false
client_idx = 0
branch_idx = 0
survey_idx = 0

CSV.open('FS_Email_Test.csv') do |file|
  file.find_all do |row|
    unless header_seen
      # First row: locate the columns we care about by name.
      client_idx = row.index("ClientID")
      branch_idx = row.index("BranchID")
      survey_idx = row.index("SurveyID")
      header_seen = true
      if branch_idx.nil? || client_idx.nil? || survey_idx.nil?
        puts "Invalid csv file - Missing one of these columns (or no headers):\nClientID\nBranchID\nSurveyID"
        break
      end
    else
      # Data row: emit the SurveyID when both criteria columns are "1".
      puts row[survey_idx] if row[branch_idx] == "1" && row[client_idx] == "1"
    end
  end
end

How to read an INI file in ruby

How do I read/write an ini file in ruby. I have an ini file that I need to
read
change an entry
write out to a different location
How would I do that in ruby? The documentation on this is bleak.
Use the InIFile Gem
As #method said, use the inifile gem. There is also an ini gem but I haven't used it.
I found the documentation here slightly more helpful than the documentation here, which is where the gem page links to.
There were not many examples so here is a bit of code to get you started:
Example Setup
First, create a file /tmp/desktop.ini with these contents:
[Desktop Entry]
Version=1.0
Type=Application
Name=Foo Viewer
Comment=The best viewer for Foo objects available!
TryExec=fooview
Exec=fooview %F
Icon=fooview
Make sure you have run gem install inifile from the command line.
Example Code
Create a file like /tmp/ini-test.rb with these contents:
require 'inifile'
require 'pp'

# Load an existing ini file from disk.
ini = IniFile.load('/tmp/desktop.ini')
section = ini["Desktop Entry"]

# Print a single property from that section.
puts "here is one property:"
puts section["Name"]

# Dump the whole parsed object.
puts "here is the loaded file:"
pp ini

# Build a brand-new ini document in memory.
fresh = IniFile.new

# Assign an entire section at once.
fresh["Desktop Entry"] = {
  "Type" => "Application",
  "Name" => 'test',
  "Exec" => 'command',
}

puts "here is a object created with new:"
pp fresh

# Choose where it should live on disk, then persist it.
fresh.filename = "/tmp/new_ini_file.ini"
fresh.write()
puts "the new object has been saved as a file to /tmp/new_ini_file.ini"
Example Results
Running that file with ruby /tmp/ini-test.rb should yield something like:
here is one property:
Foo Viewer
here is the loaded file:
{ this output hidden for brevity }
here is a object created with new:
#<IniFile:0x007feeec000770
#comment=";#",
#content=nil,
#default="global",
#encoding=nil,
#escape=true,
#filename=nil,
#ini=
{"Desktop Entry"=>
{"Type"=>"Application",
"Name"=>"test",
"Exec"=>"command",
"Icon"=>"icon_filename",
"Comment"=>"comment"}},
#param="=">
the new object has been saved as a file to /tmp/new_ini_file.ini
Modify as required to suit your needs.
I recently used ruby-inifile. Maybe it's overkill compared to the simple snippets here...
Here's a module for reading and writing .ini files with as little change to the original file as possible (for files read by both humans and machines):
# Raised by IniFile#write when the target file cannot be opened for writing.
class IniFileExc < RuntimeError
end
class IniNode
def initialize(name, value=nil)
#line_start = -1;
#line_end = -1;
#level = 0;
#name = name;
#value = value;
#keys = {};
#keylist = [];
#modified = false;
#deleted = false;
end
attr_reader :level,:line_start,:line_end,:name,:value,:keylist,:keys,:modified,:deleted
attr_writer :level,:line_start,:line_end,:name,:value,:keylist,:keys,:modified,:deleted
def to_str
return #name.to_s + ' = ' + #value.to_s;
end
def to_s
return #value.to_s;
end
def to_i
return #value.to_i
end
def to_f
return #value.to_f;
end
def
insert(key, nil);
return #keys[key];
end
def insert(key, value)
return false if (#keys.has_key?(key));
node = nil;
if (value && ((value.class == IniNode) || (value.class == IniSection)))
node = value;
else
if (#level <= 0)
node = IniSection.new(key);
else
node = IniNode.new(key, value)
end
end
node.line_start = #line_end + 1 if (node.line_start < 0);
node.level = #level + 1;
#keys[key] = node;
#keylist.push(key);
return true;
end
def []=(key, value)
rc = insert(key, value);
#keys[key].value = value;
#keys[key].modified = true;
#modified = true;
end
def delete(key)
return false if (! #keys.has_key?(key));
#keys[key].deleted = true;
#modified = true;
end
end
# A named "[section]" inside an ini file; its children are key leaves.
#
# FIX(review): the mangled paste had `#name` where `@name` must be, which
# turned the return value into a bare '[' + ']' concatenation error.
class IniSection < IniNode
  def initialize(name)
    super(name)
  end

  # Sections serialize as "[name]" rather than "name = value".
  def to_str
    '[' + @name + ']'
  end
end
# A whole ini file. @name (inherited from IniNode) holds the file path;
# children are sections. Raw source lines are kept in @lines so that
# #write can rewrite the file with minimal textual churn.
#
# NOTE(review): reconstructed from a mangled paste — `@` sigils had become
# `#`, and markdown stripped `*` quantifiers and `[`/`]` escapes from the
# regexes. The regexes below are a faithful reconstruction; verify them
# against real input files before relying on edge cases.
class IniFile < IniNode
  def initialize(path, load = true)
    super(path)
    @lines = []
    reload() if load
  end

  # (Re)read the file from disk and build the node tree.
  def reload
    begin
      input = File.new(@name, "r")
    rescue
      raise
    else
      prev_node = node = self        # current section being filled
      lineno = 0
      input.each do |line|
        @lines.push(line)
        parsed_node = parse_line(lineno, line)
        if parsed_node
          if parsed_node.class == IniSection
            if parsed_node != node
              # A new section starts: close off the previous one.
              prev_node = node
              node = parsed_node
              insert(node.name, node)
              prev_node.line_end = lineno - 1
            end
          else
            # A key = value leaf belongs to the current section.
            node.insert(parsed_node.name, parsed_node)
          end
        end
        lineno += 1
      end
      input.close
      node.line_end = @line_end = lineno - 1
    end
  end

  # Classify one source line; returns an IniSection, an IniNode leaf,
  # or nil for blank lines and '#'/';' comments.
  def parse_line(lineno, line)
    return nil if line =~ /^\s*$/       # blank
    return nil if line =~ /^\s*#/       # comment
    return nil if line =~ /^\s*;/       # comment
    if line =~ /^\s*\[\s*(.+)\s*\].*$/
      rv = IniSection.new($1)
      rv.line_start = lineno
      rv.level = @level + 1
      return rv
    elsif line =~ /^\s*(\S?.*[^=\s])\s*=\s*(\S?[^#;]*[^#;\s\n]).*$/
      rv = IniNode.new($1, $2)
      rv.line_start = rv.line_end = lineno
      rv.level = @level + 2
      return rv
    end
    nil
  end

  # Write modifications back to the file at @name, touching as few of the
  # original lines as possible. Raises IniFileExc when it cannot open the
  # file for writing.
  def write
    inserted = {}   # line index => lines to insert before that index
    @keylist.each do |sect|
      sectnode = @keys[sect]
      next if !sectnode.modified || sectnode.deleted
      if sectnode.line_end < 0
        # Brand-new section: append a blank line plus its header.
        @lines.push("\n")
        @lines.push(sectnode.to_str + "\n")
      end
      sectnode.keylist.each do |key|
        keynode = sectnode.keys[key]
        next if !keynode.modified || keynode.deleted
        if keynode.line_end < 0
          # Brand-new key.
          if sectnode.line_end < 0
            @lines.push(keynode.to_str + "\n")
          else
            idx = sectnode.line_end.to_i
            inserted[idx] = [] if !inserted.has_key?(idx)
            inserted[idx].push(keynode.to_str)
          end
        else
          # Existing key: rewrite its line in place, preserving any
          # trailing '#'/';' comment.
          line = @lines[keynode.line_start]
          if line =~ /^(\s*)(\S?.*[^=\s]\s*=\s*\S?.+[^#;\s])(\s*[#;].*)$/
            line = $1 + keynode.to_str + $3 + "\n"
          else
            line = line.gsub(/^(\s*)(\S?.*[^=\s]\s*=\s*\S?[^#;]+[^#;\n\s])(.*)$/) {
              $1 + keynode.to_str + $3 }
          end
          @lines[keynode.line_start] = line
        end
      end
    end

    deleted = {}    # line index => true when the line must be dropped
    @keylist.each do |sect|
      sectnode = @keys[sect]
      next if !sectnode.deleted && !sectnode.modified
      if sectnode.deleted && (sectnode.line_start >= 0) && (sectnode.line_end >= 0) \
         && (sectnode.line_end >= sectnode.line_start)
        (sectnode.line_start..sectnode.line_end).each { |i| deleted[i] = true }
      end
      sectnode.keylist.each do |key|
        keynode = sectnode.keys[key]
        next if !keynode.deleted
        deleted[keynode.line_start.to_i] = true \
          if (keynode.line_start >= 0) && (keynode.line_end >= 0) && (keynode.line_start == keynode.line_end)
      end
    end

    begin
      file = File.new(@name, 'w')
    rescue
      raise(IniFileExc, "Failed to open " + @name + " for writing: #{$!}", caller)
    else
      cnt = -1
      @lines.each do |line|
        cnt += 1
        if inserted.has_key?(cnt)
          inserted[cnt].each do |ins|
            file.puts(ins + "\n")
          end
        end
        next if deleted[cnt]
        file.puts(line)
      end
      file.close
    end
  end
end
Usage example:
# Load file.ini, tweak it, and write the changes back in place.
begin
ini = IniFile.new('file.ini');
# Set a key in the [common] section (both are created if missing).
ini['common']['param'] = 'value';
# Mark entries for removal; they are dropped when #write runs.
ini['common'].delete('unused_param');
ini.delete('unused_section');
print "Valuable value: ", ini['common']['param'], "\n";
# Persist all modifications/deletions to disk.
ini.write;
rescue IniFileExc
# IniFile#write raises IniFileExc when the file cannot be opened for writing.
print "Oh, that's not good: ", $!, "\n";
end
Hope this helps.
Here's another option:
http://rubygems.org/gems/ini
If I understand correctly,
# Copy in.ini to out.ini, replacing the value of the `foo` entry.
# FIX(review): the original opened out.ini with File.new and closed it
# manually, leaking the handle if an exception occurred mid-copy; the
# block form of File.open guarantees both files are closed.
File.open('out.ini', 'w') do |out_file|
  File.open('in.ini', 'r') do |in_file|
    in_file.each_line do |line|
      # foo is the entry you want to change, baz is its new value.
      out_file.puts(line.sub(/foo=(.*)/, 'foo=baz'))
    end
  end
end
Note that when you use File.open with a block, the file will automatically be closed when the block terminates.
# Echo your.ini line by line.
# FIX(review): File.new + manual close leaks the handle on an exception;
# the block form of File.open closes it in all cases.
File.open("your.ini", "r") do |file|
  file.each_line do |line|
    puts "#{line}" # additionally make changes
  end
end

Resources