PyYAML loader with duplicate keys

I'm using PyYAML to load a large YAML file which has duplicate keys. I would like to preserve all keys and modify the duplicates according to my project's needs. But PyYAML silently overwrites earlier values with the last key, so I never get a chance to modify them (loss of information), resulting in this dict: {'blocks': {'a': 'b2:11 c2:22'}}
A simple example:
import yaml

given_str = '''
blocks:
    a:
        b1:1
        c1:2
    a:
        b2:11
        c2:22'''
p = yaml.load(given_str, Loader=yaml.SafeLoader)
How can I load the YAML with duplicate keys so that I get a chance to recursively traverse it and modify keys as needed? I need to load the YAML and then transfer it into a database.

Assuming your input YAML has no merge keys ('<<'), no tags and no comments you want
to preserve, you can use the following:
import sys
import ruamel.yaml
from pathlib import Path
from typing import Any, Dict
from collections.abc import Hashable
from ruamel.yaml.constructor import ConstructorError

file_in = Path('input.yaml')

class MyConstructor(ruamel.yaml.constructor.SafeConstructor):
    def construct_mapping(self, node, deep=False):
        """deep is True when creating an object/mapping recursively,
        in that case want the underlying elements available during construction
        """
        if not isinstance(node, ruamel.yaml.nodes.MappingNode):
            raise ConstructorError(
                None, None, f'expected a mapping node, but found {node.id!s}', node.start_mark,
            )
        total_mapping = self.yaml_base_dict_type()
        if getattr(node, 'merge', None) is not None:
            todo = [(node.merge, False), (node.value, False)]
        else:
            todo = [(node.value, True)]
        for values, check in todo:
            mapping: Dict[Any, Any] = self.yaml_base_dict_type()
            for key_node, value_node in values:
                # keys can be list -> deep
                key = self.construct_object(key_node, deep=True)
                # lists are not hashable, but tuples are
                if not isinstance(key, Hashable):
                    if isinstance(key, list):
                        key = tuple(key)
                if not isinstance(key, Hashable):
                    raise ConstructorError(
                        'while constructing a mapping',
                        node.start_mark,
                        'found unhashable key',
                        key_node.start_mark,
                    )
                value = self.construct_object(value_node, deep=deep)
                if key in mapping:
                    pat = key + '_undup_{}'
                    index = 0
                    while True:
                        nkey = pat.format(index)
                        if nkey not in mapping:
                            key = nkey
                            break
                        index += 1
                mapping[key] = value
            total_mapping.update(mapping)
        return total_mapping

yaml = ruamel.yaml.YAML(typ='safe')
yaml.default_flow_style = False
yaml.Constructor = MyConstructor
data = yaml.load(file_in)
yaml.dump(data, sys.stdout)
which gives:
blocks:
  a: b1:1 c1:2
  a_undup_0: b2:11 c2:22
Please note that the values for both a keys are multiline plain scalars. For b1 and c1 to be a key,
the mapping value indicator (:, the colon) needs to be followed by a whitespace character:
a:
    b1: 1
    c1: 2

After reading many forums, I think the best solution is to create a wrapper for the YAML loader that renames the duplicate keys. @Anthon - any comment?
import yaml
from collections import defaultdict, Counter

####### Preserving duplicates ###################
def parse_preserving_duplicates(input_file):
    class PreserveDuplicatesLoader(yaml.CLoader):  # CLoader needs libyaml; yaml.Loader works as well
        pass

    def map_constructor(loader, node, deep=False):
        """Walk tree, removing degeneracy in any duplicate keys"""
        keys = [loader.construct_object(key_node, deep=deep)
                for key_node, _ in node.value]
        vals = [loader.construct_object(value_node, deep=deep)
                for _, value_node in node.value]
        key_count = Counter(keys)
        data = defaultdict(dict)  # map all data, renaming duplicates
        c = 0
        for key, value in zip(keys, vals):
            if key_count[key] > 1:
                data[f'{key}{c}'] = value
                c += 1
            else:
                data[key] = value
        return data

    PreserveDuplicatesLoader.add_constructor(yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG,
                                             map_constructor)
    return yaml.load(input_file, PreserveDuplicatesLoader)
##########################################################

with open(input_file) as f:
    data = parse_preserving_duplicates(f)
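As a quick sanity check (a sketch, not in the original post), running the wrapper on a corrected version of the question's YAML; the duplicated key a comes back as a0 and a1:

given_str = '''
blocks:
    a:
        b1: 1
        c1: 2
    a:
        b2: 11
        c2: 22'''
print(parse_preserving_duplicates(given_str))
# roughly: {'blocks': {'a0': {'b1': 1, 'c1': 2}, 'a1': {'b2': 11, 'c2': 22}}}
# (printed as nested defaultdicts)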

Related

YAML mapping order not preserved when using alias and yamlordereddictloader loader

I want to load a YAML file into Python as an OrderedDict. I am using yamlordereddictloader to preserve ordering.
However, I notice that the aliased object is placed "too soon" in the OrderedDict in the output.
How can I preserve the order of this mapping when read into Python, ideally as an OrderedDict? Is it possible to achieve this result without writing some custom parsing?
Notes:
I'm not particularly concerned with the method used, as long as the end result is the same.
Using sequences instead of mappings is problematic because they can result in nested output, and I can't simply flatten everything (some nestedness is appropriate).
When I try to just use !!omap, I cannot seem to merge the aliased mapping (d1.dt) into the d2 mapping.
I'm on Python 3.6; if I don't use this loader or !!omap, order is not preserved (apparently contrary to the top 'Update' here: https://stackoverflow.com/a/21912744/2343633)
import yaml
import yamlordereddictloader

yaml_file = """
d1:
  id:
    nm1: val1
  dt: &dt
    nm2: val2
    nm3: val3
d2:  # expect nm4, nm2, nm3
  nm4: val4
  <<: *dt
"""

out = yaml.load(yaml_file, Loader=yamlordereddictloader.Loader)
keys = [x for x in out['d2']]
print(keys)  # ['nm2', 'nm3', 'nm4']
assert keys == ['nm4', 'nm2', 'nm3'], "order from YAML file is not preserved, aliased keys placed too early"
Is it possible to achieve this result without writing some custom parsing?
Yes. You need to override the method flatten_mapping from SafeConstructor. Here's a basic working example:
import yaml
import yamlordereddictloader
from yaml.constructor import ConstructorError
from yaml.nodes import MappingNode, SequenceNode

class MyLoader(yamlordereddictloader.Loader):
    def __init__(self, stream):
        yamlordereddictloader.Loader.__init__(self, stream)

    # taken from here and reengineered to keep order:
    # https://github.com/yaml/pyyaml/blob/5.3.1/lib/yaml/constructor.py#L207
    def flatten_mapping(self, node):
        merged = []

        def merge_from(node):
            if not isinstance(node, MappingNode):
                raise ConstructorError("while constructing a mapping",
                                       node.start_mark, "expected mapping for merging, but found %s" %
                                       node.id, node.start_mark)
            self.flatten_mapping(node)
            merged.extend(node.value)

        for index in range(len(node.value)):
            key_node, value_node = node.value[index]
            if key_node.tag == 'tag:yaml.org,2002:merge':
                if isinstance(value_node, SequenceNode):
                    for subnode in value_node.value:
                        merge_from(subnode)
                else:
                    merge_from(value_node)
            else:
                if key_node.tag == 'tag:yaml.org,2002:value':
                    key_node.tag = 'tag:yaml.org,2002:str'
                merged.append((key_node, value_node))
        node.value = merged

yaml_file = """
d1:
  id:
    nm1: val1
  dt: &dt
    nm2: val2
    nm3: val3
d2:  # expect nm4, nm2, nm3
  nm4: val4
  <<: *dt
"""

out = yaml.load(yaml_file, Loader=MyLoader)
keys = [x for x in out['d2']]
print(keys)
assert keys == ['nm4', 'nm2', 'nm3'], "order from YAML file is not preserved, aliased keys placed too early"
This does not have the best performance, as it basically copies all key-value pairs of every mapping once during loading, but it works. Performance enhancement is left as an exercise for the reader :).
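If the copying overhead matters for your input, a rough timing sketch (assuming the yaml_file string and MyLoader defined above; numbers will vary with input size):

import timeit

t_plain = timeit.timeit(
    lambda: yaml.load(yaml_file, Loader=yamlordereddictloader.Loader), number=1000)
t_merge = timeit.timeit(
    lambda: yaml.load(yaml_file, Loader=MyLoader), number=1000)
print(f"plain loader: {t_plain:.3f}s, order-preserving merge: {t_merge:.3f}s")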

Human readable iterables in Sphinx documentation

Sphinx-autodoc flattens dicts, lists, and tuples, making long ones barely readable. The pretty-print format isn't always desired either, as some nested containers are better kept flattened than columned. Is there a way to display iterables as they are typed in the source code?
Get it straight from source, and add an .rst command for it:
# conf.py
from importlib import import_module
from inspect import getsource

from docutils import nodes
from docutils.parsers.rst import Directive
from sphinx import addnodes

class PrettyPrintIterable(Directive):
    required_arguments = 1

    def run(self):
        def _get_iter_source(src, varname):
            # 1. identifies target iterable by variable name (cannot be spaced)
            # 2. determines iter source code start & end by tracking brackets
            # 3. returns source code between found start & end
            start = end = None
            open_brackets = closed_brackets = 0
            for i, line in enumerate(src):
                if line.startswith(varname):
                    if start is None:
                        start = i
                if start is not None:
                    open_brackets += sum(line.count(b) for b in "([{")
                    closed_brackets += sum(line.count(b) for b in ")]}")
                    if open_brackets > 0 and (open_brackets - closed_brackets == 0):
                        end = i + 1
                        break
            return '\n'.join(src[start:end])

        module_path, member_name = self.arguments[0].rsplit('.', 1)
        src = getsource(import_module(module_path)).split('\n')
        code = _get_iter_source(src, member_name)

        literal = nodes.literal_block(code, code)
        literal['language'] = 'python'

        return [addnodes.desc_name(text=member_name),
                addnodes.desc_content('', literal)]

def setup(app):
    app.add_directive('pprint', PrettyPrintIterable)
Example .rst and result:
(:autodata: with empty :annotation: is to exclude the original flattened dictionary).
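The rendered result was shown as an image in the original answer; as a sketch, the .rst side could look like the following (the module and member names here are hypothetical):

.. autodata:: my_module.MY_DICT
   :annotation:

.. pprint:: my_module.MY_DICT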
Some code borrowed from this answer.

YAML deserializer with position information?

Does anyone know of a YAML deserializer that can provide position information for the constructed objects?
I know how to deserialize a YAML file into a Java object. Simple instructions on http://yamlbeans.sourceforge.net/.
However, I want to do some algorithmic validation on the deserialized object and report errors back to the user, pointing to the position in the YAML that caused the error.
Example:
=========YAML file==========
name: Nathan Sweet
age: 28
address: 4011 16th Ave S

=======JAVA class======
public class Contact {
    public String name;
    public int age;
    public String address;
}
Imagine if I want to first load the YAML into the Contact class and then validate the address against some repository and report an error back if it's invalid. Something like:
'Line 3 Column 9: The address does not match valid entry in the database'
The problem is, currently there is no way to get the position inside a deserialized object from YAML.
Anyone know a solution to this issue?
Most YAML parsers, if they keep any information about positions around at all, drop it while constructing the language-native objects.
In ruamel.yaml ¹, I keep more information around because I want to be able to round-trip with minimal loss of original layout (e.g. keeping comments and key order in mappings).
I don't keep information on individual key-value pairs, but I do on the "upper-left" position of a mapping². Because of the kept order of the mapping items you can give some rather nice feedback. Given an input file:
- name: anthon
  age: 53
  adres: Rijn en Schiekade 105
- name: Nathan Sweet
  age: 28
  address: 4011 16th Ave S
And a program that you call with the input file as argument:
#! /usr/bin/env python2.7
# coding: utf-8
# http://stackoverflow.com/questions/30677517/yaml-deserializer-with-position-information?noredirect=1#comment49491314_30677517
import sys
import ruamel.yaml

up_arrow = '↑'

def key_error(key, value, line, col, error, e='E'):
    print('E[{}:{}]: {}'.format(line, col, error))
    print('{}{}: {}'.format(' ' * col, key, value))
    print('{}{}'.format(' ' * col, up_arrow))
    print('---')

def value_error(key, value, line, col, error, e='E'):
    val_col = col + len(key) + 2
    print('{}[{}:{}]: {}'.format(e, line, val_col, error))
    print('{}{}: {}'.format(' ' * col, key, value))
    print('{}{}'.format(' ' * val_col, up_arrow))
    print('---')

def value_warning(key, value, line, col, error):
    value_error(key, value, line, col, error, e='W')

class Contact(object):
    def __init__(self, vals):
        for offset, k in enumerate(vals):
            self.check(k, vals[k], vals.lc.line + offset, vals.lc.col)
        for k in ['name', 'address', 'age']:
            if k not in vals:
                print('K[{}:{}]: {}'.format(
                    vals.lc.line + offset, vals.lc.col, "missing key: " + k
                ))
                print('---')

    def check(self, key, value, line, col):
        if key == 'name':
            if value[0].lower() == value[0]:
                value_error(key, value, line, col,
                            'value should start with uppercase')
        elif key == 'age':
            if value < 50:
                value_warning(key, value, line, col,
                              'probably too young for knowing ALGOL 60')
        elif key == 'address':
            pass
        else:
            key_error(key, value, line, col,
                      "unexpected key")

data = ruamel.yaml.load(open(sys.argv[1]), Loader=ruamel.yaml.RoundTripLoader)
for x in data:
    contact = Contact(x)
giving you E(rrors), W(arnings) and K(eys missing):
E[0:8]: value should start with uppercase
  name: anthon
        ↑
---
E[2:2]: unexpected key
  adres: Rijn en Schiekade 105
  ↑
---
K[2:2]: missing key: address
---
W[4:7]: probably too young for knowing ALGOL 60
  age: 28
       ↑
---
You should be able to parse this output in a calling program written in any language, to give feedback. The check method of course needs adjusting to your requirements. This is not as nice as being able to do the checking in the language the rest of your application is written in, but it might be better than nothing.
In my experience, handling the above format is certainly simpler than extending an existing (open source) YAML parser.
¹ Disclaimer: I am the author of that package
² I want to use that kind of information at some point to preserve spurious newlines, inserted for readability
In Python, you can readily write custom Dumper/Loader objects and use them to load (or dump) your YAML code. You can have these objects track the file/line info:
import yaml
from collections import OrderedDict

class YamlOrderedDict(OrderedDict):
    """
    An OrderedDict that was loaded from a yaml file, and is annotated
    with file/line info for reporting about errors in the source file
    """
    def _annotate(self, node):
        self._key_locs = {}
        self._value_locs = {}
        nodeiter = iter(node.value)
        for key in self:
            subnode = next(nodeiter)
            self._key_locs[key] = subnode[0].start_mark.name + ':' + \
                str(subnode[0].start_mark.line + 1)
            self._value_locs[key] = subnode[1].start_mark.name + ':' + \
                str(subnode[1].start_mark.line + 1)

    def key_loc(self, key):
        try:
            return self._key_locs[key]
        except (AttributeError, KeyError):
            return ''

    def value_loc(self, key):
        try:
            return self._value_locs[key]
        except (AttributeError, KeyError):
            return ''

# Use YamlOrderedDict objects for yaml maps instead of normal dict
yaml.add_representer(OrderedDict, lambda dumper, data:
                     dumper.represent_dict(data.items()))
yaml.add_representer(YamlOrderedDict, lambda dumper, data:
                     dumper.represent_dict(data.items()))

def _load_YamlOrderedDict(loader, node):
    rv = YamlOrderedDict(loader.construct_pairs(node))
    rv._annotate(node)
    return rv

yaml.add_constructor(yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, _load_YamlOrderedDict)
Now when you read a yaml file, any mapping objects will be read as a YamlOrderedDict, which allows looking up the file location of keys in the mapping object. You can also add an iterator method like:
    def iter_with_lines(self):
        for key, val in self.items():
            yield (key, val, self.key_loc(key))
...and now you can write a loop like:
for key, value, location in obj.iter_with_lines():
    # iterate through the key/value pairs in a YamlOrderedDict, with
    # the source file location
    ...
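Putting it together, a small usage sketch (assuming the definitions above; a string stream reports its position name as '<unicode string>', while a real file reports the filename):

doc = """\
name: Nathan Sweet
age: 28
address: 4011 16th Ave S
"""
obj = yaml.load(doc, Loader=yaml.Loader)
print(obj.key_loc('age'))        # something like "<unicode string>:2"
print(obj.value_loc('address'))  # something like "<unicode string>:3"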

Extract keywords in a pystache template with gettext

I have HTML templates in my project that contain pystache code such as
{{#_}}Word{{/_}}
I want to know how I can extract these words with PoEditor's parsers.
You could use a regular expression to get them, and then remove what you don't want:
import re
regex = re.compile(r"\{\{#_\}\}.+?\{\{/_\}\}")
words = re.findall(regex, data)
# To remove the markers afterwards, use re.split or simply search for [A-Z].
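For example, a short sketch (the template string here is made up) using a capturing group to pull out the words directly instead of stripping the markers afterwards:

import re

data = "<p>{{#_}}Hello{{/_}}</p> <p>{{#_}}World{{/_}}</p>"
regex = re.compile(r"\{\{#_\}\}(.+?)\{\{/_\}\}")
print(re.findall(regex, data))  # ['Hello', 'World']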
Now I'm just using this code to make a file that PoEdit can scan for extracting the keywords:
import os
import re

def makeTempLang():
    fs = getFiles('templates/')
    words = []
    regex = re.compile(r"\{\{#_\}\}(.+)\{\{/_\}\}")
    for f in fs:
        data = open(f, 'r').read()
        fwords = re.findall(regex, data)
        words.extend(fwords)
    clean = words[4:]
    data = 'from core import config\n_=config.i18n\n'
    for c in clean:
        data = "%s_('%s')\n" % (data, c)
    open('locale/temp2.py', 'w+').write(data)

def getFiles(spath=''):
    res = []
    arr = os.listdir(spath)
    for d in arr:
        dpath = os.path.join(spath, d)
        if d.endswith('.htm'):
            res.append(dpath)
        if os.path.isdir(dpath):
            sub = getFiles(dpath)
            if len(sub) > 0:
                res.extend(sub)
    return res

How to automatically turn BibTex citation into something parseable by Zotero?

I have a citation system which publishes users' notes to a wiki (Researchr). Programmatically, I have access to the full BibTeX record of each entry, and I also display this on the individual pages (for example - click on BibTeX). This is in the interest of making it easy for users of other citation managers to automatically import the citation of a paper that interests them. I would also like other citation managers, especially Zotero, to be able to automatically detect and import a citation.
Zotero lists a number of ways of exposing metadata that it will understand, including meta tags with RDF, COinS, Dublin Core and unAPI. Is there a Ruby library for converting BibTeX to any of these standards automatically - or a JavaScript library? I could probably create something, but if something existed, it would be far more robust (BibTeX has so many publication types and fields etc).
There's a BibTeX2RDF converter available here, which might be what you're after.
unAPI is not a data standard - it's a way to serve data (to Zotero and other programs). Zotero imports Bibtex, so serving Bibtex via unAPI works just fine. Inspire is an example of a site that does that:
http://inspirehep.net/
By now one can simply import BibTeX files of type .bib directly in Zotero. However, I noticed my BibTeX files were often less complete than Zotero's records (in particular, they often missed a DOI), and I did not find an "auto-complete" function (based on the data in the BibTeX entries) in Zotero.
So I import the .bib file with Zotero, to ensure the references are all in there. Then I run a python script that gets all the missing DOIs it can find for the entries in that .bib file, and exports them to a space-separated .txt file:
# pip install habanero
from habanero import Crossref
import re

def titletodoi(keyword):
    cr = Crossref()
    result = cr.works(query=keyword)
    items = result["message"]["items"]
    item_title = items[0]["title"]
    tmp = ""
    for it in item_title:
        tmp += it
    title = keyword.replace(" ", "").lower()
    title = re.sub(r"\W", "", title)
    # print('title: ' + title)
    tmp = tmp.replace(" ", "").lower()
    tmp = re.sub(r"\W", "", tmp)
    # print('tmp: ' + tmp)
    if title == tmp:
        doi = items[0]["DOI"]
        return doi
    else:
        return None

def get_dois(titles):
    dois = []
    for title in titles:
        try:
            doi = titletodoi(title)
            print(f"doi={doi}, title={title}")
            if doi is not None:
                dois.append(doi)
        except Exception:
            pass
            # print("An exception occurred")
    print(f"dois={dois}")
    return dois

def read_titles_from_file(filepath):
    with open(filepath) as f:
        lines = f.read().splitlines()
        split_lines = splits_lines(lines)
        return split_lines

def splits_lines(lines):
    split_lines = []
    for line in lines:
        new_lines = line.split(";")
        for new_line in new_lines:
            split_lines.append(new_line)
    return split_lines

def write_dois_to_file(dois, filename, separation_char):
    textfile = open(filename, "w")
    for doi in dois:
        textfile.write(doi + separation_char)
    textfile.close()

filepath = "list_of_titles.txt"
titles = read_titles_from_file(filepath)
dois = get_dois(titles)
write_dois_to_file(dois, "dois_space.txt", " ")
write_dois_to_file(dois, "dois_per_line.txt", "\n")
The DOIs in the .txt file are fed into Zotero's "magic wand" (Add Item by Identifier). Next, I (manually) remove the duplicates by choosing the latest added entry (because that comes from the magic wand with the most data).
After that, I run another script to update all the reference IDs in my .tex and .bib files to those generated by Zotero:
# Importing libraries
import bibtexparser
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import *
import os, fnmatch
import Levenshtein as lev

# Let's define a function to customize our entries.
# It takes a record and returns this record.
def customizations(record):
    """Use some functions delivered by the library

    :param record: a record
    :returns: -- customized record
    """
    record = type(record)  # bibtexparser.customization.type (from the star import), not the builtin
    record = author(record)
    record = editor(record)
    record = journal(record)
    record = keyword(record)
    record = link(record)
    record = page_double_hyphen(record)
    record = doi(record)
    return record

def get_references(filepath):
    with open(filepath) as bibtex_file:
        parser = BibTexParser()
        parser.customization = customizations
        bib_database = bibtexparser.load(bibtex_file, parser=parser)
        # print(bib_database.entries)
    return bib_database

def get_reference_mapping(main_filepath, sub_filepath):
    found_sub = []
    found_main = []
    main_into_sub = []
    main_references = get_references(main_filepath)
    sub_references = get_references(sub_filepath)
    for main_entry in main_references.entries:
        for sub_entry in sub_references.entries:
            # Match the reference IDs if 85% similar titles are detected
            lev_ratio = lev.ratio(
                remove_curly_braces(main_entry["title"]).lower(),
                remove_curly_braces(sub_entry["title"]).lower(),
            )
            if lev_ratio > 0.85:
                print(f"lev_ratio={lev_ratio}")
                if main_entry["ID"] != sub_entry["ID"]:
                    print(f'replace: {sub_entry["ID"]} with: {main_entry["ID"]}')
                    main_into_sub.append([main_entry, sub_entry])
                # Keep track of which entries have been found
                found_sub.append(sub_entry)
                found_main.append(main_entry)
    return (
        main_into_sub,
        found_main,
        found_sub,
        main_references.entries,
        sub_references.entries,
    )

def remove_curly_braces(string):
    left = string.replace("{", "")
    right = left.replace("}", "")
    return right

def replace_references(main_into_sub, directory):
    for pair in main_into_sub:
        main = pair[0]["ID"]
        sub = pair[1]["ID"]
        print(f"replace: {sub} with: {main}")
        # UNCOMMENT IF YOU WANT TO ACTUALLY DO THE PRINTED REPLACEMENT
        # findReplace(latex_root_dir, sub, main, "*.tex")
        # findReplace(latex_root_dir, sub, main, "*.bib")

def findReplace(directory, find, replace, filePattern):
    for path, dirs, files in os.walk(os.path.abspath(directory)):
        for filename in fnmatch.filter(files, filePattern):
            filepath = os.path.join(path, filename)
            with open(filepath) as f:
                s = f.read()
            s = s.replace(find, replace)
            with open(filepath, "w") as f:
                f.write(s)

def list_missing(main_references, sub_references):
    for sub in sub_references:
        if not sub["ID"] in list(map(lambda x: x["ID"], main_references)):
            print(f'the following reference has a changed title: {sub["ID"]}')

latex_root_dir = "some_path/"
main_filepath = f"{latex_root_dir}latex/Literature_study/zotero.bib"
sub_filepath = f"{latex_root_dir}latex/Literature_study/references.bib"
(
    main_into_sub,
    found_main,
    found_sub,
    main_references,
    sub_references,
) = get_reference_mapping(main_filepath, sub_filepath)
replace_references(main_into_sub, latex_root_dir)
list_missing(main_references, sub_references)

# For those references which have a Levenshtein ratio below 85 you can specify a manual swap:
manual_swap = []  # main into sub
# manual_swap.append(["cantley_impact_2021", "cantley2021impact"])
# manual_swap.append(["widemann_envision_2021", "widemann2020envision"])
for pair in manual_swap:
    main = pair[0]
    sub = pair[1]
    print(f"replace: {sub} with: {main}")
    # UNCOMMENT IF YOU WANT TO ACTUALLY DO THE PRINTED REPLACEMENT
    # findReplace(latex_root_dir, sub, main, "*.tex")
    # findReplace(latex_root_dir, sub, main, "*.bib")
