Extract keywords in a pystache template with gettext - internationalization

I have HTML templates in my project that contain pystache code such as
{{#_}}Word{{/_}}
I want to know how I can extract these words with the PoEditor parsers.

You could use a regular expression to get them, and then remove what you don't want:
import re
regex = re.compile(r"\{\{#_\}\}.+?\{\{/_\}\}")
words = re.findall(regex, data)
# To remove the {{#_}} / {{/_}} delimiters afterwards, use re.split or simply search for [A-Z].
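For example (a minimal sketch; the sample template string below is made up for illustration):
data = "<p>{{#_}}Hello{{/_}} and {{#_}}World{{/_}}</p>"
print(re.findall(regex, data))
# ['{{#_}}Hello{{/_}}', '{{#_}}World{{/_}}']
# With a capture group, r"\{\{#_\}\}(.+?)\{\{/_\}\}", findall returns just ['Hello', 'World'].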

For now I am using this code to produce a file that PoEdit can scan to extract the keywords:
import os
import re

def makeTempLang():
    fs = getFiles('templates/')
    words = []
    regex = re.compile(r"\{\{#_\}\}(.+)\{\{/_\}\}")
    for f in fs:
        data = open(f, 'r').read()
        fwords = re.findall(regex, data)
        words.extend(fwords)
    clean = words[4:]
    data = 'from core import config\n_=config.i18n\n'
    for c in clean:
        data = "%s_('%s')\n" % (data, c)
    open('locale/temp2.py', 'w+').write(data)

def getFiles(spath=''):
    res = []
    arr = os.listdir(spath)
    for d in arr:
        dpath = os.path.join(spath, d)
        if d.endswith('.htm'):
            res.append(dpath)
        if os.path.isdir(dpath):
            sub = getFiles(dpath)
            if len(sub) > 0:
                res.extend(sub)
    return res
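As a side note, the same directory scan could be written with os.walk; a minimal sketch (not part of the original code, keeping the same .htm filter):
def get_htm_files(spath=''):
    # walk the whole tree and collect every .htm file
    res = []
    for root, dirs, files in os.walk(spath):
        for name in files:
            if name.endswith('.htm'):
                res.append(os.path.join(root, name))
    return res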

Related

PyYAML loader with duplicate keys

Using PyYAML for loading a (large) YAML file which has duplicate keys. I would like to preserve all keys and would modify each duplicate key according to project needs. But it seems PyYAML silently overwrites results with the last key, and I never get a chance to modify them (loss of information), resulting in this dict: {'blocks': {'a': 'b2:11 c2:22'}}
simple example YAML:
import yaml
given_str = '''
blocks:
  a:
    b1:1
    c1:2
  a:
    b2:11
    c2:22'''
p = yaml.load(given_str)
How can I load the YAML with duplicate keys so that I get a chance to recursively traverse it and modify keys as my need. I need to load YAML and then transfer it into a database.
Assuming your input YAML has no merge keys ('<<'), no tags and no comments you want
to preserve, you can use the following:
import sys
import ruamel.yaml
from pathlib import Path
from collections.abc import Hashable
from typing import Any, Dict
from ruamel.yaml.constructor import ConstructorError

file_in = Path('input.yaml')


class MyConstructor(ruamel.yaml.constructor.SafeConstructor):
    def construct_mapping(self, node, deep=False):
        """deep is True when creating an object/mapping recursively,
        in that case want the underlying elements available during construction
        """
        if not isinstance(node, ruamel.yaml.nodes.MappingNode):
            raise ConstructorError(
                None, None, f'expected a mapping node, but found {node.id!s}', node.start_mark,
            )
        total_mapping = self.yaml_base_dict_type()
        if getattr(node, 'merge', None) is not None:
            todo = [(node.merge, False), (node.value, False)]
        else:
            todo = [(node.value, True)]
        for values, check in todo:
            mapping: Dict[Any, Any] = self.yaml_base_dict_type()
            for key_node, value_node in values:
                # keys can be list -> deep
                key = self.construct_object(key_node, deep=True)
                # lists are not hashable, but tuples are
                if not isinstance(key, Hashable):
                    if isinstance(key, list):
                        key = tuple(key)
                if not isinstance(key, Hashable):
                    raise ConstructorError(
                        'while constructing a mapping',
                        node.start_mark,
                        'found unhashable key',
                        key_node.start_mark,
                    )
                value = self.construct_object(value_node, deep=deep)
                if key in mapping:
                    pat = key + '_undup_{}'
                    index = 0
                    while True:
                        nkey = pat.format(index)
                        if nkey not in mapping:
                            key = nkey
                            break
                        index += 1
                mapping[key] = value
            total_mapping.update(mapping)
        return total_mapping


yaml = ruamel.yaml.YAML(typ='safe')
yaml.default_flow_style = False
yaml.Constructor = MyConstructor

data = yaml.load(file_in)
yaml.dump(data, sys.stdout)
which gives:
blocks:
  a: b1:1 c1:2
  a_undup_0: b2:11 c2:22
Please note that the values for both a keys are multiline plain scalars. For b1 and c1 to be a key
the mapping value indicator (:, the colon) needs to be followed by a whitespace character:
a:
  b1: 1
  c1: 2
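You can see the difference quickly with PyYAML itself (a minimal check using the safe loader):
import yaml

# No space after the colon: the nested lines fold into one plain scalar
print(yaml.safe_load("a:\n  b1:1\n  c1:2"))    # {'a': 'b1:1 c1:2'}
# With the space, you get a nested mapping
print(yaml.safe_load("a:\n  b1: 1\n  c1: 2"))  # {'a': {'b1': 1, 'c1': 2}}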
After reading many forums, I think the best solution is to create a wrapper for the YAML loader that de-duplicates the keys. @Anthon - any comment?
import yaml
from collections import defaultdict, Counter

####### Preserving duplicates ###################
def parse_preserving_duplicates(input_file):
    # yaml.CLoader requires libyaml; fall back to yaml.Loader if it is unavailable
    class PreserveDuplicatesLoader(yaml.CLoader):
        pass

    def map_constructor(loader, node, deep=False):
        """Walk tree, removing degeneracy in any duplicate keys"""
        keys = [loader.construct_object(node, deep=deep) for node, _ in node.value]
        vals = [loader.construct_object(node, deep=deep) for _, node in node.value]
        key_count = Counter(keys)
        data = defaultdict(dict)  # map all data, renaming duplicate keys
        c = 0
        for key, value in zip(keys, vals):
            if key_count[key] > 1:
                data[f'{key}{c}'] = value
                c += 1
            else:
                data[key] = value
        return data

    PreserveDuplicatesLoader.add_constructor(yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG,
                                             map_constructor)
    return yaml.load(input_file, PreserveDuplicatesLoader)
##########################################################

with open(inputf, 'r') as input_file:
    fp = parse_preserving_duplicates(input_file)

Human readable iterables in Sphinx documentation

Sphinx-autodoc flattens dicts, lists, and tuples - making long ones barely readable. Pretty-print format isn't always desired either, as some nested containers are better kept flattened than columned. Is there a way to display iterables as typed in source code?
Get it straight from source, and add an .rst command for it:
# conf.py
from importlib import import_module
from docutils import nodes
from sphinx import addnodes
from inspect import getsource
from docutils.parsers.rst import Directive


class PrettyPrintIterable(Directive):
    required_arguments = 1

    def run(self):
        def _get_iter_source(src, varname):
            # 1. identifies target iterable by variable name, (cannot be spaced)
            # 2. determines iter source code start & end by tracking brackets
            # 3. returns source code between found start & end
            start = end = None
            open_brackets = closed_brackets = 0
            for i, line in enumerate(src):
                if line.startswith(varname):
                    if start is None:
                        start = i
                if start is not None:
                    open_brackets += sum(line.count(b) for b in "([{")
                    closed_brackets += sum(line.count(b) for b in ")]}")
                    if open_brackets > 0 and (open_brackets - closed_brackets == 0):
                        end = i + 1
                        break
            return '\n'.join(src[start:end])

        module_path, member_name = self.arguments[0].rsplit('.', 1)
        src = getsource(import_module(module_path)).split('\n')
        code = _get_iter_source(src, member_name)

        literal = nodes.literal_block(code, code)
        literal['language'] = 'python'

        return [addnodes.desc_name(text=member_name),
                addnodes.desc_content('', literal)]


def setup(app):
    app.add_directive('pprint', PrettyPrintIterable)
Example .rst and result:
(:autodata: with empty :annotation: is to exclude the original flattened dictionary).
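A minimal .rst usage might look like the following (module and member names are placeholders; the rendered result from the original answer is not reproduced here):
.. autodata:: my_module.MY_DICT
   :annotation:

.. pprint:: my_module.MY_DICT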
Some code borrowed from this answer.

How to decode IFC using Ruby

In Ruby, I'm reading an .ifc file to get some information, but I can't decode it. For example, the file content:
"'S\X2\00E9\X0\jour/Cuisine'"
should be:
"'Séjour/Cuisine'"
I'm trying to encode it with:
puts ifcFileLine.encode("Windows-1252")
puts ifcFileLine.encode("ISO-8859-1")
puts ifcFileLine.encode("ISO-8859-5")
puts ifcFileLine.encode("iso-8859-1").force_encoding("utf-8")'
But nothing gives me what I need.
I don't know anything about IFC, but based solely on the page Denis linked to and your example input, this works:
ESCAPE_SEQUENCE_EXPR = /\\X2\\(.*?)\\X0\\/
def decode_ifc(str)
  str.gsub(ESCAPE_SEQUENCE_EXPR) do
    $1.gsub(/..../) { $&.to_i(16).chr(Encoding::UTF_8) }
  end
end
str = 'S\X2\00E9\X0\jour/Cuisine'
puts "Input:", str
puts "Output:", decode_ifc(str)
All this code does is replace every sequence of four characters (/..../) between the delimiters, which will each be a Unicode code point in hexadecimal, with the corresponding Unicode character.
Note that this code handles only this specific encoding. A quick glance at the implementation guide shows other encodings, including an \X4 directive for Unicode characters outside the Basic Multilingual Plane. This ought to get you started, though.
See it on eval.in: https://eval.in/776980
If someone is interested, here is Python code I wrote that decodes three of the IFC encodings: \X, \X2\ and \S\:
import re

def decodeIfc(txt):
    # In regex, "\" is hard to manage in Python... I use this workaround
    txt = txt.replace('\\', 'µµµ')
    txt = re.sub('µµµX2µµµ([0-9A-F]{4,})+µµµX0µµµ', decodeIfcX2, txt)
    txt = re.sub('µµµSµµµ(.)', decodeIfcS, txt)
    txt = re.sub('µµµXµµµ([0-9A-F]{2})', decodeIfcX, txt)
    txt = txt.replace('µµµ', '\\')
    return txt

def decodeIfcX2(match):
    # X2 encodes characters with multiples of 4 hexadecimal digits.
    return ''.join(list(map(lambda x: chr(int(x, 16)), re.findall('([0-9A-F]{4})', match.group(1)))))

def decodeIfcS(match):
    return chr(ord(match.group(1)) + 128)

def decodeIfcX(match):
    # Sometimes, IFC files were made with old Macs... which use MacRoman encoding.
    num = int(match.group(1), 16)
    if (num <= 127) or (num >= 160):
        return chr(num)
    else:
        return bytes.fromhex(match.group(1)).decode("macroman")
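A quick check on the example from the question (assuming the functions above are defined):
sample = r'S\X2\00E9\X0\jour/Cuisine'
print(decodeIfc(sample))  # -> Séjour/Cuisine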

What is the better logic for identifying the latest versioned files from a big list

I have 200 images in a folder and each file may have several versions (example: car_image#2, car_image#2, bike_image#2, etc.). My requirement is to build a utility to copy all the latest files from this directory to another.
My approach is:
Put the imagesNames (without containing version numbers) into a list
Eliminate the duplicates from the list
Iterate through the list and identify the latest version of each unique file (I am a little unclear on this step)
Can someone throw some better ideas/algorithm to achieve this?
My approach would be:
Make a list of unique names by getting each filename up to the #, only adding unique values.
Make a dictionary with filenames as keys, and set values to be the version number, updating when it's larger than the one stored.
Go through the dictionary and produce the filenames to grab.
My go-to would be a python script but you should be able to do this in pretty much whatever language you find suitable.
Ex code for getting the filename list:
#get the filename list
myList = []
for x in file_directory:
    fname = x.split("#")[0]
    if fname not in myList:
        myList = myList + [fname]

myDict = {}
for x in myList:
    if x not in myDict:
        myDict[x] = 0

for x in file_directory:
    fname = x.split("#")[0]
    fversion = x.split("#")[-1]
    if myDict[fname] < int(fversion):
        myDict[fname] = int(fversion)

flist = []
for x in myDict:
    fname = str(x) + "#" + str(myDict[x])
    flist.append(fname)
Then flist would be a list of filenames of the most recent versions
I didn't run this or anything but hopefully it helps!
In Python 3:
>>> import random
>>> images = sorted(set(sum([['%s_image#%i' % (nm, random.randint(1,9)) for i in range(random.randint(2,5))] for nm in 'car bike cat dog man tree'.split()], [])))
>>> print('\n'.join(images))
bike_image#2
bike_image#3
bike_image#4
bike_image#5
car_image#2
car_image#7
cat_image#3
dog_image#2
dog_image#5
dog_image#9
man_image#1
man_image#2
man_image#4
man_image#6
man_image#7
tree_image#3
tree_image#4
>>> from collections import defaultdict
>>> image2max = defaultdict(int)
>>> for image in images:
        name, _, version = image.partition('#')
        version = int(version)
        if version > image2max[name]:
            image2max[name] = version
>>> # Max version
>>> for image in sorted(image2max):
        print('%s#%i' % (image, image2max[image]))
bike_image#5
car_image#7
cat_image#3
dog_image#9
man_image#7
tree_image#4
>>>

How to automatically turn BibTex citation into something parseable by Zotero?

I have a citation system which publishes users notes to a wiki (Researchr). Programmatically, I have access to the full BibTeX record of each entry, and I also display this on the individual pages (for example - click on BibTeX). This is in the interest of making it easy for users of other citation manager to automatically import the citation of a paper that interests them. I would also like other citation managers, especially Zotero, to be able to automatically detect and import a citation.
Zotero lists a number of ways of exposing metadata that it will understand, including meta tags with RDF, COiNS, Dublin Core and unAPI. Is there a Ruby library for converting BibTeX to any of these standards automatically - or a Javascript library? I could probably create something, but if something existed, it would be far more robust (BibTeX has so many publication types and fields etc).
There's a BibTeX2RDF converter available here, which might be what you're after.
unAPI is not a data standard - it's a way to serve data (to Zotero and other programs). Zotero imports Bibtex, so serving Bibtex via unAPI works just fine. Inspire is an example of a site that does that:
http://inspirehep.net/
By now one can simply import BibTeX files of type .bib directly into Zotero. However, I noticed my BibTeX files were often less complete than Zotero's records (in particular they often missed a DOI), and I did not find an "auto-complete" function (based on the data in the BibTeX entries) in Zotero.
So I import the .bib file with Zotero, to ensure all entries are in there. Then I run a Python script that finds all the missing DOIs it can for the entries in that .bib file, and exports them to a space-separated .txt file:
# pip install habanero
from habanero import Crossref
import re


def titletodoi(keyword):
    cr = Crossref()
    result = cr.works(query=keyword)
    items = result["message"]["items"]
    item_title = items[0]["title"]
    tmp = ""
    for it in item_title:
        tmp += it
    title = keyword.replace(" ", "").lower()
    title = re.sub(r"\W", "", title)
    # print('title: ' + title)
    tmp = tmp.replace(" ", "").lower()
    tmp = re.sub(r"\W", "", tmp)
    # print('tmp: ' + tmp)
    if title == tmp:
        doi = items[0]["DOI"]
        return doi
    else:
        return None
def get_dois(titles):
    dois = []
    for title in titles:
        try:
            doi = titletodoi(title)
            print(f"doi={doi}, title={title}")
            if doi is not None:
                dois.append(doi)
        except Exception:
            pass
            # print("An exception occurred")
    print(f"dois={dois}")
    return dois


def read_titles_from_file(filepath):
    with open(filepath) as f:
        lines = f.read().splitlines()
    split_lines = splits_lines(lines)
    return split_lines


def splits_lines(lines):
    split_lines = []
    for line in lines:
        new_lines = line.split(";")
        for new_line in new_lines:
            split_lines.append(new_line)
    return split_lines


def write_dois_to_file(dois, filename, separation_char):
    textfile = open(filename, "w")
    for doi in dois:
        textfile.write(doi + separation_char)
    textfile.close()


filepath = "list_of_titles.txt"
titles = read_titles_from_file(filepath)
dois = get_dois(titles)
write_dois_to_file(dois, "dois_space.txt", " ")
write_dois_to_file(dois, "dois_per_line.txt", "\n")
The DOIs from the .txt file are fed into the magic wand of Zotero. Next, I (manually) remove the duplicates by choosing the latest added entry (because that comes from the magic wand with the most data).
After that, I run another script to update all the reference id's in my .tex and .bib files to those generated by Zotero:
# Importing libraries
import bibtexparser
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import *
import os, fnmatch
import Levenshtein as lev


# Let's define a function to customize our entries.
# It takes a record and returns this record.
def customizations(record):
    """Use some functions delivered by the library

    :param record: a record
    :returns: -- customized record
    """
    record = type(record)
    record = author(record)
    record = editor(record)
    record = journal(record)
    record = keyword(record)
    record = link(record)
    record = page_double_hyphen(record)
    record = doi(record)
    return record


def get_references(filepath):
    with open(filepath) as bibtex_file:
        parser = BibTexParser()
        parser.customization = customizations
        bib_database = bibtexparser.load(bibtex_file, parser=parser)
        # print(bib_database.entries)
    return bib_database


def get_reference_mapping(main_filepath, sub_filepath):
    found_sub = []
    found_main = []
    main_into_sub = []
    main_references = get_references(main_filepath)
    sub_references = get_references(sub_filepath)
    for main_entry in main_references.entries:
        for sub_entry in sub_references.entries:
            # Match the reference IDs if 85% similar titles are detected
            lev_ratio = lev.ratio(
                remove_curly_braces(main_entry["title"]).lower(),
                remove_curly_braces(sub_entry["title"]).lower(),
            )
            if lev_ratio > 0.85:
                print(f"lev_ratio={lev_ratio}")
                if main_entry["ID"] != sub_entry["ID"]:
                    print(f'replace: {sub_entry["ID"]} with: {main_entry["ID"]}')
                    main_into_sub.append([main_entry, sub_entry])
                # Keep track of which entries have been found
                found_sub.append(sub_entry)
                found_main.append(main_entry)
    return (
        main_into_sub,
        found_main,
        found_sub,
        main_references.entries,
        sub_references.entries,
    )


def remove_curly_braces(string):
    left = string.replace("{", "")
    right = left.replace("}", "")
    return right


def replace_references(main_into_sub, directory):
    for pair in main_into_sub:
        main = pair[0]["ID"]
        sub = pair[1]["ID"]
        print(f"replace: {sub} with: {main}")
        # UNCOMMENT IF YOU WANT TO ACTUALLY DO THE PRINTED REPLACEMENT
        # findReplace(latex_root_dir, sub, main, "*.tex")
        # findReplace(latex_root_dir, sub, main, "*.bib")


def findReplace(directory, find, replace, filePattern):
    for path, dirs, files in os.walk(os.path.abspath(directory)):
        for filename in fnmatch.filter(files, filePattern):
            filepath = os.path.join(path, filename)
            with open(filepath) as f:
                s = f.read()
            s = s.replace(find, replace)
            with open(filepath, "w") as f:
                f.write(s)


def list_missing(main_references, sub_references):
    for sub in sub_references:
        if not sub["ID"] in list(map(lambda x: x["ID"], main_references)):
            print(f'the following reference has a changed title: {sub["ID"]}')


latex_root_dir = "some_path/"
main_filepath = f"{latex_root_dir}latex/Literature_study/zotero.bib"
sub_filepath = f"{latex_root_dir}latex/Literature_study/references.bib"

(
    main_into_sub,
    found_main,
    found_sub,
    main_references,
    sub_references,
) = get_reference_mapping(main_filepath, sub_filepath)

replace_references(main_into_sub, latex_root_dir)
list_missing(main_references, sub_references)

# For those references which have a Levenshtein ratio below 0.85 you can specify a manual swap:
manual_swap = []  # main into sub
# manual_swap.append(["cantley_impact_2021", "cantley2021impact"])
# manual_swap.append(["widemann_envision_2021", "widemann2020envision"])
for pair in manual_swap:
    main = pair[0]
    sub = pair[1]
    print(f"replace: {sub} with: {main}")
    # UNCOMMENT IF YOU WANT TO ACTUALLY DO THE PRINTED REPLACEMENT
    # findReplace(latex_root_dir, sub, main, "*.tex")
    # findReplace(latex_root_dir, sub, main, "*.bib")
