How to automatically turn BibTex citation into something parseable by Zotero? - ruby

I have a citation system which publishes users notes to a wiki (Researchr). Programmatically, I have access to the full BibTeX record of each entry, and I also display this on the individual pages (for example - click on BibTeX). This is in the interest of making it easy for users of other citation manager to automatically import the citation of a paper that interests them. I would also like other citation managers, especially Zotero, to be able to automatically detect and import a citation.
Zotero lists a number of ways of exposing metadata that it will understand, including meta tags with RDF, COiNS, Dublin Core and unAPI. Is there a Ruby library for converting BibTeX to any of these standards automatically - or a Javascript library? I could probably create something, but if something existed, it would be far more robust (BibTeX has so many publication types and fields etc).

There's a BibTeX2RDF converter available here; it might be what you're after.

unAPI is not a data standard - it's a way to serve data (to Zotero and other programs). Zotero imports Bibtex, so serving Bibtex via unAPI works just fine. Inspire is an example of a site that does that:
http://inspirehep.net/

By now one can simply import bibtex files of type .bib directly in Zotero. However, I noticed my bibtex files were often less complete than Zotero (in particular they often missed a DOI), and I did not find an "auto-complete" function (based on the data in the bibtex entries) in Zotero.
So I import the .bib file into Zotero, to ensure all entries are in there. Then I run a Python script that gets all the missing DOIs it can find for the entries in that .bib file, and exports them to a space-separated .txt file:
# pip install habanero
from habanero import Crossref
import re
def titletodoi(keyword):
    """Look up the DOI of the work whose title matches ``keyword``.

    Queries Crossref for ``keyword`` and inspects only the first hit. The DOI
    is returned only when that hit's title equals ``keyword`` after both have
    been lower-cased and stripped of every non-word character.

    :param keyword: title to search for
    :returns: the DOI of the first hit, or ``None`` when there are no hits,
        the hit has no title, or the titles differ
    """

    def _normalise(text):
        # Whitespace is non-word as well, so a single substitution is enough.
        return re.sub(r"\W", "", text.lower())

    result = Crossref().works(query=keyword)
    items = result["message"]["items"]
    if not items:
        # No hit at all: previously this surfaced as an IndexError.
        return None

    first = items[0]
    # The title field is consumed as a sequence of string fragments.
    candidate = "".join(first.get("title") or [])
    if not candidate:
        return None

    if _normalise(keyword) == _normalise(candidate):
        return first["DOI"]
    return None
def get_dois(titles):
    """Resolve each title to a DOI and return the DOIs that were found.

    Lookups are best effort: a title whose lookup raises is reported and
    skipped so that one failure does not abort the whole batch.

    :param titles: iterable of title strings
    :returns: list of DOIs, in the order of the titles that matched
    """
    dois = []
    for title in titles:
        try:
            doi = titletodoi(title)
        except Exception as exc:
            # Exception (not a bare except) so Ctrl-C still stops the run.
            print(f"lookup failed for title={title}: {exc}")
            continue
        print(f"doi={doi}, title={title}")
        if doi is not None:
            dois.append(doi)
    print(f"dois={dois}")
    return dois
def read_titles_from_file(filepath):
    """Return the titles stored in ``filepath`` as one flat list.

    The file is read in full, broken into lines, and every line is passed
    through ``splits_lines`` so that ``;``-separated titles become separate
    items.

    :param filepath: path of the text file to read
    :returns: list of title strings
    """
    with open(filepath) as handle:
        raw_lines = handle.read().splitlines()
    return splits_lines(raw_lines)
def splits_lines(lines):
    """Split every line on ``;`` and return all pieces, in order, as one list.

    :param lines: iterable of strings
    :returns: flat list of the ``;``-separated pieces of every line
    """
    return [piece for line in lines for piece in line.split(";")]
def write_dois_to_file(dois, filename, separation_char):
    """Write every DOI to ``filename``, each followed by ``separation_char``.

    The file is overwritten. A separator is written after the last DOI too.

    :param dois: iterable of DOI strings
    :param filename: path of the file to (over)write
    :param separation_char: string appended after each DOI
    """
    # The context manager closes the file even when a write raises.
    with open(filename, "w") as textfile:
        textfile.writelines(doi + separation_char for doi in dois)
# Driver: read the titles, resolve them to DOIs, and write the DOIs twice.
filepath = "list_of_titles.txt"
titles = read_titles_from_file(filepath)
dois = get_dois(titles)
# Same DOIs in two layouts: one space-separated file, one DOI per line.
write_dois_to_file(dois, "dois_space.txt", " ")
write_dois_to_file(dois, "dois_per_line.txt", "\n")
The DOIs of the .txt are fed into magic wand of Zotero. Next, I (manually) remove the duplicates by choosing the latest added entry (because that comes from the magic wand with the most data).
After that, I run another script to update all the reference id's in my .tex and .bib files to those generated by Zotero:
# Importing library
import bibtexparser
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import *
import os, fnmatch
import Levenshtein as lev
# Let's define a function to customize our entries.
# It takes a record and return this record.
def customizations(record):
    """Run a record through a fixed chain of customization helpers.

    :param record: a record
    :returns: -- the record after every helper has been applied in turn
    """
    # Same helpers in the same order; each one receives the previous result.
    pipeline = (
        type,
        author,
        editor,
        journal,
        keyword,
        link,
        page_double_hyphen,
        doi,
    )
    for step in pipeline:
        record = step(record)
    return record
def get_references(filepath):
    """Load the BibTeX file at ``filepath`` and return the parsed database.

    Every record is passed through ``customizations`` while parsing.

    :param filepath: path of the .bib file
    :returns: the object returned by ``bibtexparser.load``
    """
    parser = BibTexParser()
    parser.customization = customizations
    with open(filepath) as bibtex_file:
        return bibtexparser.load(bibtex_file, parser=parser)
# Pair up entries of two BibTeX files whose titles are nearly identical.
# Both files are loaded with get_references; every main title is compared
# with every sub title (braces removed, lower-cased) using lev.ratio.
# Returns a 5-tuple: (main_into_sub, found_main, found_sub,
# main_references.entries, sub_references.entries).
# NOTE(review): this copy has lost its indentation, so whether the found_*
# appends run for every match above 0.85 or only when the IDs differ cannot
# be told from here — confirm against the original script.
def get_reference_mapping(main_filepath, sub_filepath):
found_sub = []
found_main = []
main_into_sub = []
main_references = get_references(main_filepath)
sub_references = get_references(sub_filepath)
for main_entry in main_references.entries:
for sub_entry in sub_references.entries:
# Treat two entries as the same reference when their titles have a
# Levenshtein ratio above 0.85
lev_ratio = lev.ratio(
remove_curly_braces(main_entry["title"]).lower(),
remove_curly_braces(sub_entry["title"]).lower(),
)
if lev_ratio > 0.85:
print(f"lev_ratio={lev_ratio}")
# Only entries whose IDs differ need a replacement
if main_entry["ID"] != sub_entry["ID"]:
print(f'replace: {sub_entry["ID"]} with: {main_entry["ID"]}')
main_into_sub.append([main_entry, sub_entry])
# Keep track of which entries have been found
found_sub.append(sub_entry)
found_main.append(main_entry)
return (
main_into_sub,
found_main,
found_sub,
main_references.entries,
sub_references.entries,
)
def remove_curly_braces(string):
    """Return ``string`` with every ``{`` and ``}`` removed.

    :param string: text that may contain BibTeX protective braces
    :returns: the text without any curly braces
    """
    # The second replace must target the closing brace.
    return string.replace("{", "").replace("}", "")
def replace_references(main_into_sub, directory):
    """Print the ID replacement that each matched pair would cause.

    Nothing is changed on disk unless the commented-out calls are enabled.

    :param main_into_sub: sequence of ``[main_entry, sub_entry]`` pairs
    :param directory: not used by the active code
    """
    for pair in main_into_sub:
        new_id = pair[0]["ID"]
        old_id = pair[1]["ID"]
        print(f"replace: {old_id} with: {new_id}")
        # UNCOMMENT IF YOU WANT TO ACTUALLY DO THE PRINTED REPLACEMENT
        # findReplace(latex_root_dir, sub, main, "*.tex")
        # findReplace(latex_root_dir, sub, main, "*.bib")
def findReplace(directory, find, replace, filePattern):
    """Replace ``find`` with ``replace`` in all matching files under ``directory``.

    Walks the directory tree recursively and edits, in place, every file
    whose name matches ``filePattern``.

    :param directory: root directory of the search
    :param find: substring to look for
    :param replace: text that takes its place
    :param filePattern: ``fnmatch`` pattern for file names, e.g. ``"*.tex"``
    """
    for path, _dirs, files in os.walk(os.path.abspath(directory)):
        for filename in fnmatch.filter(files, filePattern):
            filepath = os.path.join(path, filename)
            with open(filepath) as f:
                original = f.read()
            updated = original.replace(find, replace)
            # Skip the write when nothing matched, so untouched files are not
            # truncated, rewritten and re-timestamped.
            if updated != original:
                with open(filepath, "w") as f:
                    f.write(updated)
def list_missing(main_references, sub_references):
    """Print every sub reference whose ID does not occur among the main ones.

    :param main_references: iterable of entries, each with an ``"ID"`` key
    :param sub_references: iterable of entries, each with an ``"ID"`` key
    """
    # Build the lookup once instead of once per sub entry.
    main_ids = {entry["ID"] for entry in main_references}
    for sub in sub_references:
        if sub["ID"] not in main_ids:
            print(f'the following reference has a changed title:{sub["ID"]}')
# Driver: map the IDs of references.bib (sub) onto those of zotero.bib (main).
latex_root_dir = "some_path/"
main_filepath = f"{latex_root_dir}latex/Literature_study/zotero.bib"
sub_filepath = f"{latex_root_dir}latex/Literature_study/references.bib"
(
main_into_sub,
found_main,
found_sub,
main_references,
sub_references,
) = get_reference_mapping(main_filepath, sub_filepath)
# Report the automatic replacements, then the sub IDs with no main counterpart.
replace_references(main_into_sub, latex_root_dir)
list_missing(main_references, sub_references)
# For references whose titles do not pass the Levenshtein ratio threshold of
# 0.85 you can specify a manual swap:
manual_swap = [] # main into sub
# manual_swap.append(["cantley_impact_2021","cantley2021impact"])
# manual_swap.append(["widemann_envision_2021","widemann2020envision"])
# Each pair is [main ID, sub ID]; by default the swap is only printed.
for pair in manual_swap:
main = pair[0]
sub = pair[1]
print(f"replace: {sub} with: {main}")
# UNCOMMENT IF YOU WANT TO ACTUALLY DO THE PRINTED REPLACEMENT
# findReplace(latex_root_dir, sub, main, "*.tex")
# findReplace(latex_root_dir, sub, main, "*.bib")

Related

PyYAML loader with duplicate keys

I am using PyYAML to load a (large) YAML file which has duplicate keys. I would like to preserve all keys and modify the duplicate keys according to project needs. But it seems PyYAML silently overwrites earlier values with that of the last key, so I do not get a chance to modify them as I need (loss of information), resulting in this dict: {'blocks':{'a':'b2:11 c2:22'}}
simple example YAML:
import yaml
# Minimal reproduction: the key "a" occurs twice under "blocks".
# The nesting below is what the reported result requires; "b1:1" has no space
# after the colon, so each "a" value is one multi-line plain scalar.
given_str = '''
blocks:
  a:
    b1:1
    c1:2
  a:
    b2:11
    c2:22'''
# safe_load instead of load: calling yaml.load() without a Loader is
# deprecated since PyYAML 5.1 and raises TypeError from 6.0 on.
p = yaml.safe_load(given_str)
How can I load the YAML with duplicate keys so that I get a chance to recursively traverse it and modify keys as my need. I need to load YAML and then transfer it into a database.
Assuming your input YAML has no merge keys ('<<'), no tags and no comments you want
to preserve, you can use the following:
import sys
import ruamel.yaml
from pathlib import Path
from collections.abc import Hashable
file_in = Path('input.yaml')
class MyConstructor(ruamel.yaml.constructor.SafeConstructor):
    """Safe constructor that renames duplicate mapping keys instead of dropping them."""

    def construct_mapping(self, node, deep=False):
        """Build a mapping from ``node``, keeping every duplicated key.

        deep is True when creating an object/mapping recursively,
        in that case want the underlying elements available during construction

        A key that is already present in the mapping being built is stored
        as ``<key>_undup_<n>``, with the smallest ``n`` that is still free.

        :param node: the mapping node to construct
        :param deep: passed on when constructing the values
        :returns: the constructed mapping
        :raises ConstructorError: if ``node`` is not a mapping node or a key
            is unhashable
        """
        # Referenced through the module: the bare name is not imported here.
        ConstructorError = ruamel.yaml.constructor.ConstructorError

        if not isinstance(node, ruamel.yaml.nodes.MappingNode):
            raise ConstructorError(
                None, None, f'expected a mapping node, but found {node.id!s}', node.start_mark,
            )
        total_mapping = self.yaml_base_dict_type()
        if getattr(node, 'merge', None) is not None:
            todo = [(node.merge, False), (node.value, False)]
        else:
            todo = [(node.value, True)]
        for values, _check in todo:
            mapping = self.yaml_base_dict_type()
            for key_node, value_node in values:
                # keys can be list -> deep
                key = self.construct_object(key_node, deep=True)
                # lists are not hashable, but tuples are
                if not isinstance(key, Hashable):
                    if isinstance(key, list):
                        key = tuple(key)
                if not isinstance(key, Hashable):
                    raise ConstructorError(
                        'while constructing a mapping',
                        node.start_mark,
                        'found unhashable key',
                        key_node.start_mark,
                    )
                value = self.construct_object(value_node, deep=deep)
                if key in mapping:
                    # f-string rather than ``key + '...'.format``: also works
                    # for non-string keys and for keys containing braces.
                    index = 0
                    while True:
                        nkey = f'{key}_undup_{index}'
                        if nkey not in mapping:
                            key = nkey
                            break
                        index += 1
                mapping[key] = value
            total_mapping.update(mapping)
        return total_mapping
# Load file_in with the duplicate-preserving constructor and echo the result.
yaml = ruamel.yaml.YAML(typ='safe')
yaml.default_flow_style = False
# Swap in the constructor subclass defined above.
yaml.Constructor = MyConstructor
data = yaml.load(file_in)
yaml.dump(data, sys.stdout)
which gives:
blocks:
a: b1:1 c1:2
a_undup_0: b2:11 c2:22
Please note that the values for both a keys are multiline plain scalars. For b1 and c1 to be a key
the mapping value indicator (:, the colon) needs to be followed by a whitespace character:
a:
b1: 1
c1: 2
After reading many forums, I think the best solution is to create a wrapper for the YAML loader (removing the duplicates). #Anthon - any comment?
import yaml
from collections import defaultdict, Counter
####### Preserving Duplicate ###################
def parse_preserving_duplicates(input_file):
    """Load YAML from ``input_file`` without losing duplicated mapping keys.

    A key that occurs more than once within one mapping is stored as the key
    followed by a running number; keys that occur once are kept as they are.

    :param input_file: YAML text or an open stream, as accepted by ``yaml.load``
    :returns: the loaded data, with plain ``dict`` objects for mappings
    """

    class PreserveDuplicatesLoader(yaml.CLoader):
        """Private subclass so the constructor override stays local."""

    def map_constructor(loader, node, deep=False):
        """Walk tree, removing degeneracy in any duplicate keys"""
        keys = [loader.construct_object(key_node, deep=deep) for key_node, _ in node.value]
        vals = [loader.construct_object(value_node, deep=deep) for _, value_node in node.value]
        key_count = Counter(keys)
        # Plain dict: a defaultdict would insert an empty dict on every
        # lookup of a missing key.
        data = {}
        c = 0
        for key, value in zip(keys, vals):
            if key_count[key] > 1:
                new_key = f'{key}{c}'
                # Never reuse a name that is a real key or already taken.
                while new_key in key_count or new_key in data:
                    c += 1
                    new_key = f'{key}{c}'
                data[new_key] = value
                c += 1
            else:
                data[key] = value
        return data

    PreserveDuplicatesLoader.add_constructor(
        yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, map_constructor
    )
    # NOTE(review): this loader derives from the full CLoader; do not feed it
    # untrusted input without switching to a safe loader base.
    return yaml.load(input_file, PreserveDuplicatesLoader)
##########################################################
# Pass the opened handle: ``input_file`` is only the parameter name inside
# parse_preserving_duplicates and is not bound here.
with open(inputf, 'r') as file:
    fp = parse_preserving_duplicates(file)

PYsimpleGUI create a listbox of folders

I am trying to modify the demoprogram from PYsimpleGUI (Browser_START_HERE_Demo_program_Browser.py) to:
manually select a main folder
list all the subfolders in that folder (but not the files inside them)
make it possible to select a few of those folders, and list them as an output.
I thought I'd do so by editing the code for getting the file list dict, but everything I tried just makes it
Any ideas? I attached it:
def get_file_list_dict():
    """
    Returns dictionary of files
    Key is short filename
    Value is the full filename and path

    Only ``.py`` and ``.pyw`` files below the path returned by
    ``get_demo_path()`` are listed. A name that is already present is stored
    as ``<filename>_<i>`` with the first free ``i`` from 1 to 99.

    :return: Dictionary of demo files
    :rtype: Dict[str:str]
    """
    demo_path = get_demo_path()
    demo_files_dict = {}
    for dirname, _dirnames, filenames in os.walk(demo_path):
        for filename in filenames:
            if not filename.endswith(('.py', '.pyw')):
                continue
            fname_full = os.path.join(dirname, filename)
            if filename not in demo_files_dict:
                demo_files_dict[filename] = fname_full
                continue
            # Allow up to 99 duplicated names. After that, give up
            for i in range(1, 100):
                new_filename = f'{filename}_{i}'
                if new_filename not in demo_files_dict:
                    demo_files_dict[new_filename] = fname_full
                    break
    return demo_files_dict
It's rather difficult for me to modify the code of Browser_START_HERE_Demo_program_Browser.py to fit these requirements.
Assume the target is
Select a main directory by a button to call function sg.popup_get_folder
List all subdirectories under main directory in one sg.Listbox
Subdirectories selected shown in another sg.Listbox as output when click Add button
Example Code
from pathlib import Path
import PySimpleGUI as sg
# Example GUI: choose a main folder, list its subfolders, copy a selection.
font = ("Courier New", 11)
sg.theme("Dark")
sg.set_options(font=font)
subfolders = []
selected = []
# Upper listbox (key 'Subfolders') uses extended selection mode.
frame_subholders = [[sg.Listbox(subfolders, size=(80, 10), key='Subfolders',
select_mode=sg.LISTBOX_SELECT_MODE_EXTENDED, enable_events=True,
highlight_background_color='blue', highlight_text_color='white')]]
# Lower listbox (key 'Selected') shows the output.
frame_selected = [[sg.Listbox(selected, size=(80, 10), key='Selected')]]
layout = [
[sg.Input(readonly=True, expand_x=True, key='Main',
disabled_readonly_background_color=sg.theme_input_background_color()),
sg.Button("Main Folder")],
[sg.Frame("Subholder", frame_subholders)],
[sg.Frame("Selected subholder", frame_selected)],
[sg.Button('Add')],
]
window = sg.Window('Title', layout, finalize=True)
# Width of the input field in characters; used below to shorten long paths.
entry = window['Main'].Widget
input_size = entry.winfo_width()//sg.Text.char_width_in_pixels(font)
print(input_size)
# Event loop: runs until the window is closed.
while True:
event, values = window.read()
if event == sg.WINDOW_CLOSED:
break
elif event == 'Main Folder':
main_folder = sg.popup_get_folder("", no_window=True)
if main_folder and Path(main_folder).is_dir():
main_folder = main_folder.replace("/", '\\') # For Windows
half = input_size//2
# A path longer than the field is shown as "<start>...<end>".
text = main_folder if len(main_folder) <= input_size else main_folder[:half-3]+"..."+main_folder[-half:]
window['Main'].update(text)
# Direct child directories only; files are left out.
subfolders = sorted([str(f) for f in Path(main_folder).iterdir() if f.is_dir()])
window['Subfolders'].update(values=subfolders)
# A new main folder clears the previous selection.
selected = []
window['Selected'].update(values=selected)
elif event == 'Add':
# presumably values['Subfolders'] holds the highlighted items — confirm in the PySimpleGUI docs
selected = sorted([path for path in values['Subfolders']])
window['Selected'].update(values=selected)
window.close()

how to add symbols to the multiple stock data

#I have scraped the data; my code is below. Now I want to add a column of symbols to the respective company data. Please guide me on how the symbol can be added to the respective firm's data.
#code below
from time import sleep
import pandas as pd
import os
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
browser = webdriver.Chrome(ChromeDriverManager().install())
symbols =['FATIMA',
'SSGC',
'FCCL',
'ISL',
'KEL',
'NCL',
'DGKC',
'SNGP',
'NML',
'ENGRO',
'HUMNL',
'CHCC',
'ATRL',
'HUBC',
'ASTL',
'PIBTL',
'OGDC',
'EFERT',
'FFC',
'NCPL',
'KTML',
'PSO',
'LUCK',
'SEARL',
'KOHC',
'ABOT',
'AICL',
'HASCOL',
'PTC',
'KAPCO',
'PIOC',
'POL',
'SHEL',
'GHGL',
'HCAR',
'DCR',
'BWCL',
'MTL',
'GLAXO',
'PKGS',
'SHFA','MARI',
'ICI',
'ACPL',
'PSMC',
'SPWL',
'THALL',
'BNWM',
'EFUG',
'GADT',
'AABS']
# Scrape one snapshot table per symbol and stack the results into dft.
# NOTE(review): this copy has lost its indentation, so the nesting of the
# statements below has to be confirmed against the original script.
company = 1
for ThisSymbol in symbols :
# Get first symbol from the above python list
# NOTE(review): company is set to 2 here on every pass, so the
# "company == 1" branch further down looks unreachable — confirm.
company = 2
# In the URL, make symbol as variable
url = 'http://www.scstrade.com/stockscreening/SS_CompanySnapShotYF.aspx?symbol=' + ThisSymbol
browser.get(url)
sleep(2)
# The below command will get all the contents from the url
html = browser.execute_script("return document.documentElement.outerHTML")
# So we will supply the contents to beautiful soup and we tell to consider this text as a html, with the following command
soup = BeautifulSoup (html, "html.parser")
for rn in range(0,9) :
plist = []
r = soup.find_all('tr')[rn]
# Condition: if first row, then th, otherwise td
if (rn==0) :
celltag = 'th'
else :
celltag = 'td'
# Now use the celltag instead of using fixed td or th
col = r.find_all(celltag)
print()
# NOTE(review): i is read here before the loop below assigns it — confirm intent.
if col[i] == 0:
print ("")
else:
for i in range(0,4) :
cell = col[i].text
clean = cell.replace('\xa0 ', '')
clean = clean.replace (' ', '')
plist.append(clean)
# If first row, create df, otherwise add to it
if (rn == 0) :
df = pd.DataFrame(plist)
else :
df2 = pd.DataFrame(plist)
colname = 'y' + str(2019-rn)
df[colname] = df2
if (company == 1):
dft = df.T
# Get header Column
head = dft.iloc[0]
# Exclude first row from the data
dft = dft[1:]
dft.columns = head
dft = dft.reset_index()
# Assign Headers
dft = dft.drop(['index'], axis = 'columns')
else:
dft2 = df.T
# Get header Column
head = dft2.iloc[0]
# Exclude first row from the data
dft2 = dft2[1:]
dft2.columns = head
dft2 = dft2.reset_index()
# Assign Headers
dft2 = dft2.drop(['index'], axis = 'columns')
# NOTE(review): this sets Symbol on the whole accumulated dft, while the
# rows appended from dft2 are never given a Symbol of their own.
dft['Symbol'] = ThisSymbol
# NOTE(review): DataFrame.append was removed in pandas 2.0 — confirm the pandas version in use.
dft = dft.append(dft2, sort=['Year','Symbol'])
company = company +1
dft
My output looks like this; I want to have a symbol column for each respective firm's data.
Symbol,i have added
dft['Symbol'] = ThisSymbol
but it add just first company from the list to all companies data
enter image description here

Human readable iterables in Sphinx documentation

Sphinx-autodoc flattens dicts, lists, and tuples - making long ones barely readable. Pretty-print format isn't always desired either, as some nested containers are better kept flattened than columned. Is there a way to display iterables as typed in source code?
Get it straight from source, and add an .rst command for it:
# conf.py
from importlib import import_module
from docutils import nodes
from sphinx import addnodes
from inspect import getsource
from docutils.parsers.rst import Directive
class PrettyPrintIterable(Directive):
    """Directive that renders a module-level iterable as typed in its source.

    Takes one argument of the form ``package.module.member``.
    """

    required_arguments = 1

    def run(self):
        """Return a name node and a Python literal block with the member's source."""

        def _get_iter_source(src, varname):
            # Finds the first line starting with varname (no leading spaces),
            # then counts brackets until they balance, and returns the lines
            # between that start and the balancing line.
            first = last = None
            opened = closed = 0
            for lineno, text in enumerate(src):
                if first is None and text.startswith(varname):
                    first = lineno
                if first is not None:
                    opened += sum(text.count(ch) for ch in "([{")
                    closed += sum(text.count(ch) for ch in ")]}")
                if opened > 0 and opened == closed:
                    last = lineno + 1
                    break
            return '\n'.join(src[first:last])

        target = self.arguments[0]
        module_path, member_name = target.rsplit('.', 1)
        module_source = getsource(import_module(module_path))
        code = _get_iter_source(module_source.split('\n'), member_name)

        literal = nodes.literal_block(code, code)
        literal['language'] = 'python'

        name_node = addnodes.desc_name(text=member_name)
        content_node = addnodes.desc_content('', literal)
        return [name_node, content_node]
def setup(app):
    """Register ``PrettyPrintIterable`` with ``app`` under the name ``pprint``."""
    directive_name = 'pprint'
    app.add_directive(directive_name, PrettyPrintIterable)
Example .rst and result:
(:autodata: with empty :annotation: is to exclude the original flattened dictionary).
Some code borrowed from this answer.

Extract keywords in a pystache template with gettext

I have HTML templates in my project that contain pystache code such as
{{#_}}Word{{\_}}
I want to know how I can extract these words with the PoEditor parsers.
You could use a regular expression to get them, and then remove what you don't want:
import re
# Find the tagged words in data (data is not defined in this snippet).
# NOTE(review): as written the pattern needs a single "{", a literal
# backslash, "_}}" and a trailing space after the word, and the greedy ".+"
# can span several tags — confirm this matches the templates' closing tag.
regex=re.compile("\{\{\#\_\}\}.+\{\\\_\}\} ")
# There is no capture group, so each result is the whole match, tags included.
words=re.findall(regex, data)
#To remove it use re.split or simply now searching for [A-Z].
Now I am just using this code to generate a file that PoEdit can scan to extract the keywords:
def makeTempLang():
    """Collect the tagged words from the templates and write a stub module.

    Every ``{{#_}}...{{/_}}`` section found in the files returned by
    ``getFiles('templates/')`` becomes one ``_('...')`` call in
    ``locale/temp2.py``, so that a gettext scanner can pick the words up.
    """
    # Non-greedy, so two tags on one line give two words instead of one
    # merged match.
    regex = re.compile(r"\{\{#_\}\}(.+?)\{\{/_\}\}")
    words = []
    for f in getFiles('templates/'):
        with open(f, 'r') as template:
            words.extend(regex.findall(template.read()))
    # NOTE(review): the first four matches are skipped, as before; the reason
    # is not visible here — confirm it is still wanted.
    clean = words[4:]
    lines = ['from core import config\n', '_=config.i18n\n']
    # repr() quotes and escapes each word, so a word containing a quote
    # still yields a valid Python string literal.
    lines.extend(f"_({c!r})\n" for c in clean)
    with open('locale/temp2.py', 'w+') as out:
        out.write(''.join(lines))
def getFiles(spath=''):
    """Recursively collect the paths of all ``.htm`` entries below ``spath``.

    :param spath: directory to search; the default ``''`` means the current
        directory
    :returns: list of paths, each prefixed with ``spath``, in listing order
    """
    # Local import so the function works even where os is not imported.
    import os

    res = []
    # os.listdir('') raises FileNotFoundError, so list '.' for the default.
    for d in os.listdir(spath or '.'):
        dpath = os.path.join(spath, d)
        if d.endswith('.htm'):
            res.append(dpath)
        if os.path.isdir(dpath):
            res.extend(getFiles(dpath))
    return res

Resources