Apply a Quartz filter while saving PDF under Mac OS X 10.6.3

Using the Mac OS X API, I'm trying to save a PDF file with a Quartz filter applied, just as is possible from the "Save As" dialog in the Preview application. So far I've written the following code (using Python and PyObjC, but the language isn't important to me):
-- filter-pdf.py: begin
from Foundation import *
from Quartz import *
import objc
page_rect = CGRectMake(0, 0, 612, 792)
fdict = NSDictionary.dictionaryWithContentsOfFile_("/System/Library/Filters/Blue Tone.qfilter")
in_pdf = CGPDFDocumentCreateWithProvider(CGDataProviderCreateWithFilename("test.pdf"))
url = CFURLCreateWithFileSystemPath(None, "test_out.pdf", kCFURLPOSIXPathStyle, False)
c = CGPDFContextCreateWithURL(url, page_rect, fdict)
np = CGPDFDocumentGetNumberOfPages(in_pdf)
for ip in range(1, np + 1):
    page = CGPDFDocumentGetPage(in_pdf, ip)
    r = CGPDFPageGetBoxRect(page, kCGPDFMediaBox)
    CGContextBeginPage(c, r)
    CGContextDrawPDFPage(c, page)
    CGContextEndPage(c)
-- filter-pdf.py: end
Unfortunately, the "Blue Tone" filter isn't applied; the output PDF looks exactly like the input PDF.
Question: what did I miss? How do I apply a filter?
Well, the documentation doesn't promise that creating and passing "fdict" this way will cause the filter to be applied. But I simply rewrote (as far as I could) the sample code /Developer/Examples/Quartz/Python/filter-pdf.py, which was distributed with older versions of Mac OS X (and this code doesn't work either):
----- filter-pdf-old.py: begin
from CoreGraphics import *
import sys, os, math, getopt, string
def usage ():
    print '''
usage: python filter-pdf.py FILTER INPUT-PDF OUTPUT-PDF

Apply a ColorSync Filter to a PDF document.
'''

def main ():
    page_rect = CGRectMake (0, 0, 612, 792)
    try:
        opts, args = getopt.getopt (sys.argv[1:], '', [])
    except getopt.GetoptError:
        usage ()
        sys.exit (1)
    if len (args) != 3:
        usage ()
        sys.exit (1)

    filter = CGContextFilterCreateDictionary (args[0])
    if not filter:
        print 'Unable to create context filter'
        sys.exit (1)
    pdf = CGPDFDocumentCreateWithProvider (CGDataProviderCreateWithFilename (args[1]))
    if not pdf:
        print 'Unable to open input file'
        sys.exit (1)
    c = CGPDFContextCreateWithFilename (args[2], page_rect, filter)
    if not c:
        print 'Unable to create output context'
        sys.exit (1)

    for p in range (1, pdf.getNumberOfPages () + 1):
        #r = pdf.getMediaBox (p)
        r = pdf.getPage(p).getBoxRect(p)
        c.beginPage (r)
        c.drawPDFDocument (r, pdf, p)
        c.endPage ()
    c.finish ()

if __name__ == '__main__':
    main ()
----- filter-pdf-old.py: end
=======================================================================
The working code based on the answer:
from Foundation import *
from Quartz import *
pdf_url = NSURL.fileURLWithPath_("test.pdf")
pdf_doc = PDFDocument.alloc().initWithURL_(pdf_url)
furl = NSURL.fileURLWithPath_("/System/Library/Filters/Blue Tone.qfilter")
fobj = QuartzFilter.quartzFilterWithURL_(furl)
fdict = { 'QuartzFilter': fobj }
pdf_doc.writeToFile_withOptions_("test_out.pdf", fdict)

Two approaches: if you need to open and modify an already existing file, use PDFKit's PDFDocument (reference) and call PDFDocument's writeToFile_withOptions_ with an options dict that includes the "QuartzFilter" key set to the needed filter.
OTOH if you need your own drawing and have a CGContext at hand, you can use something along these lines:
from Foundation import *  # NSMutableData and NSURL live in Foundation
from Quartz import *
data = NSMutableData.dataWithCapacity_(1024**2)
dataConsumer = CGDataConsumerCreateWithCFData(data)
context = CGPDFContextCreate(dataConsumer, None, None)
f = QuartzFilter.quartzFilterWithURL_(NSURL.fileURLWithPath_("YourFltr.qfilter"))
f.applyToContext_(context)
# do your drawing
CGPDFContextClose(context)
# the PDF is in the data variable. Do whatever you need to do with the data (save to file...).
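For example (the output path here is just an assumption), the resulting NSData can be written straight to disk:
data.writeToFile_atomically_("filtered_out.pdf", True)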

Related

pystray on macOS using run_detached crashes the program

I am trying to use pystray to create an icon in the task tray. It works on Windows, but now I am building one for Mac. I need the program to minimize to the task tray and run in the background, so I need to use icon.run_detached() instead of icon.run().
However, it keeps crashing the app. From reading the documentation it seems that I need to pass some darwin_nsapplication = AppKit.NSApplication.sharedApplication() to the code, but I really don't know how to implement this. Here is my code.
import tkinter as tk
import time
import pystray
from tkinter import *
from tkinter import messagebox
from PIL import Image
import AppKit
class Gui():
    def __init__(self):
        self.window = tk.Tk()
        self.darwin_nsapplication = AppKit.NSApplication.sharedApplication()
        self.image = Image.open("./images/noname.png")
        self.menu = (
            pystray.MenuItem('Show', self.show_window),
            pystray.MenuItem('Quit', self.quit_window)
        )

        # Declaration of variables
        self.hour = StringVar()
        self.minute = StringVar()
        self.second = StringVar()

        # setting the default value as 0
        self.hour.set("00")
        self.minute.set("00")
        self.second.set("00")

        # Use of Entry class to take input from the user
        hourEntry = Entry(self.window, width=3, font=("Arial", 18, ""),
                          textvariable=self.hour)
        hourEntry.place(x=80, y=20)

        minuteEntry = Entry(self.window, width=3, font=("Arial", 18, ""),
                            textvariable=self.minute)
        minuteEntry.place(x=130, y=20)

        secondEntry = Entry(self.window, width=3, font=("Arial", 18, ""),
                            textvariable=self.second)
        secondEntry.place(x=180, y=20)

        # button widget
        btn = Button(self.window, text='Set Time Countdown', bd='5',
                     command=self.submit)
        btn.place(x=70, y=120)

    def submit(self):
        try:
            # the input provided by the user is stored here: temp
            temp = int(self.hour.get()) * 3600 + int(self.minute.get()) * 60 + int(self.second.get())
        except:
            print("Please input the right value")
        while temp > -1:
            # divmod(firstvalue = temp//60, secondvalue = temp%60)
            mins, secs = divmod(temp, 60)

            # Converting the input entered in mins or secs to hours,
            # mins, secs (input = 110 min --> 110*60 = 6600 => 1hr : 50min : 0sec)
            hours = 0
            if mins > 60:
                hours, mins = divmod(mins, 60)

            # using the format() method to store the value up to two digits
            self.hour.set("{0:2d}".format(hours))
            self.minute.set("{0:2d}".format(mins))
            self.second.set("{0:2d}".format(secs))

            # updating the GUI window after decrementing the temp value every time
            self.window.update()
            time.sleep(1)

            # when temp value = 0, a messagebox pops up with the message "Time's up"
            if temp == 0:
                messagebox.showinfo("Time Countdown", "Time's up ")

            # after every one sec the value of temp will be decremented by one
            temp -= 1

    def quit_window(self):
        self.icon.stop()
        self.window.destroy()

    def show_window(self):
        self.icon.stop()
        self.window.protocol('WM_DELETE_WINDOW', self.withdraw_window)
        self.window.after(0, self.window.deiconify)

    def withdraw_window(self):
        self.window.withdraw()
        self.icon = pystray.Icon("name", self.image, "title", self.menu)
        self.icon.run_detached()

if __name__ in '__main__':
    app = Gui()
    app.window.protocol('WM_DELETE_WINDOW', app.withdraw_window)
    app.window.mainloop()
I tried adding darwin_nsapplication to the icon, like self.icon = pystray.Icon("name", self.image, "title", self.menu, self.darwin_nsapplication).
But that raises an error saying 6 arguments were given while 2-5 are needed.
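A hedged reading of that error: pystray.Icon seems to accept at most four positional arguments (name, icon, title, menu - plus self, hence the "2-5"), so the NSApplication object cannot be passed as an extra positional argument:
# Sketch consistent with the reported TypeError: only name, icon, title and
# menu are accepted positionally; the NSApplication object is not one of them.
self.icon = pystray.Icon("name", self.image, "title", self.menu)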

How to get all installed font paths?

How can I get the paths of all installed fonts with pywin32?
I can only find a way using the registry keys, but I would prefer to directly use GDI or DirectWrite.
Edit:
I am not sure, but from what I can see, here is how it might be possible with DirectWrite:
Create Factory: https://learn.microsoft.com/en-us/windows/win32/api/dwrite/nf-dwrite-dwritecreatefactory
GetSystemFontCollection: https://learn.microsoft.com/en-us/windows/win32/api/dwrite/nf-dwrite-idwritefactory-getsystemfontcollection
Do a for loop with GetFontFamilyCount: https://learn.microsoft.com/en-us/windows/win32/api/dwrite/nf-dwrite-idwritefontcollection-getfontfamilycount
GetFontFamily: https://learn.microsoft.com/en-us/windows/win32/api/dwrite/nf-dwrite-idwritefontcollection-getfontfamily
GetMatchingFonts (the weight, stretch and style params can be anything; they seem to only change the order of the returned list): https://learn.microsoft.com/en-us/windows/win32/api/dwrite/nf-dwrite-idwritefontfamily-getmatchingfonts
Do a for loop with GetFontCount: https://learn.microsoft.com/en-us/windows/win32/api/dwrite/nf-dwrite-idwritefontlist-getfontcount
GetFont: https://learn.microsoft.com/en-us/windows/win32/api/dwrite/nf-dwrite-idwritefontlist-getfont
CreateFontFace: https://learn.microsoft.com/en-us/windows/win32/api/dwrite/nf-dwrite-idwritefont-createfontface
GetFiles: https://learn.microsoft.com/en-us/windows/win32/api/dwrite/nf-dwrite-idwritefontface-getfiles
GetReferenceKey: https://learn.microsoft.com/en-us/windows/win32/api/dwrite/nf-dwrite-idwritefontfile-getreferencekey
Again DWriteCreateFactory but with uuidof IDWriteLocalFontFileLoader
GetFilePathFromKey: https://learn.microsoft.com/en-us/windows/win32/directwrite/idwritelocalfontfileloader-getfilepathfromkey
I found a solution using the DirectWrite API.
This code depends on the pyglet library.
import sys
import time
from ctypes import byref, c_uint32, create_unicode_buffer
from pyglet.font.directwrite import (
    DWriteCreateFactory,
    DWRITE_FACTORY_TYPE_ISOLATED,
    IDWriteFactory,
    IDWriteFont,
    IDWriteFontCollection,
    IDWriteFontFace,
    IDWriteFontFamily,
    IDWriteFontFile,
    IDWriteFontFileLoader,
    IDWriteLocalFontFileLoader,
    IID_IDWriteFactory,
    IID_IDWriteLocalFontFileLoader,
)
from pyglet.libs.win32.types import c_void_p
from typing import List
def get_fonts_filepath() -> List[str]:
    """Return a list of the file paths of all installed fonts."""
    write_factory = IDWriteFactory()
    DWriteCreateFactory(
        DWRITE_FACTORY_TYPE_ISOLATED, IID_IDWriteFactory, byref(write_factory)
    )
    fonts_path = set()
    sys_collection = IDWriteFontCollection()
    write_factory.GetSystemFontCollection(byref(sys_collection), 0)

    collection_count = sys_collection.GetFontFamilyCount()
    for i in range(collection_count):
        family = IDWriteFontFamily()
        sys_collection.GetFontFamily(i, byref(family))

        font_count = family.GetFontCount()
        for j in range(font_count):
            font = IDWriteFont()
            family.GetFont(j, byref(font))

            font_face = IDWriteFontFace()
            font.CreateFontFace(byref(font_face))

            file_ct = c_uint32()
            font_face.GetFiles(byref(file_ct), None)

            font_files = (IDWriteFontFile * file_ct.value)()
            font_face.GetFiles(byref(file_ct), font_files)

            pff = font_files[0]

            key_data = c_void_p()
            ff_key_size = c_uint32()
            pff.GetReferenceKey(byref(key_data), byref(ff_key_size))

            loader = IDWriteFontFileLoader()
            pff.GetLoader(byref(loader))

            try:
                local_loader = IDWriteLocalFontFileLoader()
                loader.QueryInterface(
                    IID_IDWriteLocalFontFileLoader, byref(local_loader)
                )
            except OSError:  # E_NOTIMPL
                font.Release()
                font_face.Release()
                loader.Release()
                pff.Release()
                continue

            path_len = c_uint32()
            local_loader.GetFilePathLengthFromKey(
                key_data, ff_key_size, byref(path_len)
            )

            buffer = create_unicode_buffer(path_len.value + 1)
            local_loader.GetFilePathFromKey(key_data, ff_key_size, buffer, len(buffer))

            font.Release()
            font_face.Release()
            loader.Release()
            local_loader.Release()
            pff.Release()

            fonts_path.add(buffer.value)

        family.Release()

    sys_collection.Release()
    write_factory.Release()
    return list(fonts_path)


def main():
    start = time.time()
    fonts_path_dwrite = get_fonts_filepath()
    print(time.time() - start)
    print(fonts_path_dwrite)


if __name__ == "__main__":
    sys.exit(main())

Creating button that clips Entries

I've created a program that is supposed to take questions and entries from a user and copy them to the clipboard. It works fine as a regular program, but when I try to adapt it to a GUI I run into an issue. Currently the program is only copying the questions, and the entries are returning empty strings. I know that if I broke each entry out into its own named variable I could probably fix this issue, but a loop seems like a much cleaner solution. Can anyone assist?
import tkinter as tk
from tkinter import *
import pyperclip
system = 'What is the system?'
product = 'What is the product?'
issue = 'What is the issue?'
error = 'Is there an error message?'
screenshot = 'Do you have a screenshot or documentation for this issue?'
impact = 'Is the floor impacted. If so, what is the impact?'
users = 'How many users is this affecting?'
troubleshooting = 'Was there troubleshooting performed?'
changes = 'Are you aware of any changes that may have led up to the issue?'
ticket = 'Do you have an internal ticket number?'
questions = (
    system, product, issue, error,
    screenshot, impact, users, troubleshooting,
    changes, ticket)

entries = []
clip = []

index = 0
c = 0
r = 0

root = tk.Tk()
root.title('SysIt4')

top_frame = tk.Frame(root)
bottom_frame = tk.Frame(root)
top_frame.grid(column=0, row=0, sticky=W)
bottom_frame.grid(column=0, row=1)

canvas = tk.Canvas(root, width=600, height=800)
canvas.grid()

while index < 10:
    label = tk.Label(top_frame, text=questions[index])
    label.grid(columnspan=2, column=c, row=r, sticky=W)
    r += 2
    index += 1

for r in range(1, 20, 2):
    entry = tk.Entry(top_frame, width=50)
    entry.grid(columnspan=2, column=c, row=r, sticky=W, padx=10, pady=5)
    entries.append(entry)

def enact_clip(entries, questions):
    responses = []
    outfile = open('copy.txt', 'w')
    for entry in entries:
        responses.append(entry.get())
    clip = list(zip(questions, responses))
    for line in clip:
        outfile.write(str(line) + '\n')
    outfile.close()
    infile = open('copy.txt', 'r')
    copy_contents = infile.read()
    return pyperclip.copy(copy_contents)
    infile.close()

clip_button = Button(bottom_frame, text='Clip', command=enact_clip(entries, questions))
clip_button.grid(column=0, row=1)

root.mainloop()
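A hedged aside on the likely cause: command=enact_clip(entries, questions) calls the function once while the GUI is still being built (when every Entry is empty) and binds its return value to the button. Deferring the call is the usual Tkinter pattern:
# Sketch: wrap the call in a lambda so it runs on click, not at widget creation.
clip_button = Button(bottom_frame, text='Clip',
                     command=lambda: enact_clip(entries, questions))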

Detect duplicate videos from YouTube

For my M.Tech project,
I want to know if there is any algorithm to detect duplicate videos on YouTube.
For example (here are links to two videos):
random user upload
upload by official channel
Of these, the second is the official video, and T-Series holds its copyright.
Is YouTube officially doing something to remove duplicate videos?
Not only videos; duplicate YouTube channels exist as well.
Sometimes the original video has fewer views than the pirated version.
So, while searching, I found this
(see page number [49] of the pdf)
What I learnt from the given link:
A classifier is used to distinguish original videos from copyright-infringing ones.
Given a query, the top k search results are first retrieved. Thereafter, three parameters are used to classify the videos:
Number of subscribers
user profile
username popularity
On the basis of these parameters, the original video is identified as described in the link.
EDIT 1:
There are basically two different objectives:
To identify the original video with the above method
To eliminate the duplicate videos
Obviously, identifying the original video is easier than finding all the duplicate videos, so I preferred to first find the original video.
The approach I can think of so far to improve the accuracy:
First find the original videos with the above method
Then use the most popular frames (maybe multiple) of that video for a Google Images search. This retrieves a list of duplicate videos from the image search results.
After getting these candidate duplicates, check once again frame by frame until reaching a level of confidence (yes, the retrieved videos were "exact" or "almost" duplicate copies of the original video)
Will this approach work?
If not, is there any better algorithm to improve upon the given method?
Please write in the comments section if I am unable to explain my approach clearly.
I will soon add some more details.
I've recently hacked together a small tool for that purpose. It's still a work in progress but usually pretty accurate. The idea is to simply compare the times between brightness maxima in the center of the video. Therefore it should work across different resolutions, frame rates and rotations of the video.
ffmpeg is used for decoding, imageio as a bridge to Python, numpy/scipy for the maxima computation, and some k-nearest-neighbor library (annoy, cyflann, hnsw) for comparison.
At the moment it's not polished at all, so you should know a little Python to run it, or simply copy the idea.
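Not the tool itself, but a minimal sketch of that idea, assuming imageio (with its ffmpeg plugin), numpy and scipy are installed; the file name, crop size and peak spacing below are arbitrary choices:
# Sketch: fingerprint a video by the time gaps between brightness maxima
# in a center crop; resolution and rotation barely affect this signal.
import imageio.v2 as imageio
import numpy as np
from scipy.signal import find_peaks

def brightness_fingerprint(path, crop=64):
    reader = imageio.get_reader(path)
    fps = reader.get_meta_data()["fps"]
    series = []
    for frame in reader:
        h, w = frame.shape[:2]
        cy, cx = h // 2, w // 2
        patch = frame[cy - crop:cy + crop, cx - crop:cx + crop]
        series.append(patch.mean())  # mean brightness of the center patch
    # keep at most one maximum per second of video
    peaks, _ = find_peaks(np.asarray(series), distance=int(fps))
    return np.diff(peaks) / fps  # gap sequence in seconds

# Two uploads of the same clip should produce nearly identical gap sequences,
# which can then be matched with a k-nearest-neighbor index.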
I had the same problem, so I wrote a program myself.
The problem was that I had videos of various formats and resolutions, so I needed to take a hash of each video's frames and compare them.
https://github.com/gklc811/duplicate_video_finder
You can just change the directories at the top and you are good to go.
from os import path, walk, makedirs, rename
from time import perf_counter  # time.clock was removed in Python 3.8
from imagehash import average_hash
from PIL import Image
from cv2 import VideoCapture, CAP_PROP_FRAME_COUNT, CAP_PROP_FRAME_WIDTH, CAP_PROP_FRAME_HEIGHT, CAP_PROP_FPS
from json import dump, load
from multiprocessing import Pool, cpu_count

input_vid_dir = r'C:\Users\gokul\Documents\data\\'
json_dir = r'C:\Users\gokul\Documents\db\\'
analyzed_dir = r'C:\Users\gokul\Documents\analyzed\\'
duplicate_dir = r'C:\Users\gokul\Documents\duplicate\\'

if not path.exists(json_dir):
    makedirs(json_dir)

if not path.exists(analyzed_dir):
    makedirs(analyzed_dir)

if not path.exists(duplicate_dir):
    makedirs(duplicate_dir)


def write_to_json(filename, data):
    file_full_path = json_dir + filename + ".json"
    with open(file_full_path, 'w') as file_pointer:
        dump(data, file_pointer)
    return


def video_to_json(filename):
    file_full_path = input_vid_dir + filename
    start = perf_counter()
    size = round(path.getsize(file_full_path) / 1024 / 1024, 2)
    video_pointer = VideoCapture(file_full_path)
    frame_count = int(VideoCapture.get(video_pointer, int(CAP_PROP_FRAME_COUNT)))
    width = int(VideoCapture.get(video_pointer, int(CAP_PROP_FRAME_WIDTH)))
    height = int(VideoCapture.get(video_pointer, int(CAP_PROP_FRAME_HEIGHT)))
    fps = int(VideoCapture.get(video_pointer, int(CAP_PROP_FPS)))

    success, image = video_pointer.read()
    video_hash = {}
    while success:
        # hash every frame and map the hash back to the file it came from
        frame_hash = average_hash(Image.fromarray(image))
        video_hash[str(frame_hash)] = filename
        success, image = video_pointer.read()

    stop = perf_counter()
    time_taken = stop - start

    print("Time taken for ", file_full_path, " is : ", time_taken)

    data_dict = dict()
    data_dict['size'] = size
    data_dict['time_taken'] = time_taken
    data_dict['fps'] = fps
    data_dict['height'] = height
    data_dict['width'] = width
    data_dict['frame_count'] = frame_count
    data_dict['filename'] = filename
    data_dict['video_hash'] = video_hash

    write_to_json(filename, data_dict)
    return


def multiprocess_video_to_json():
    files = next(walk(input_vid_dir))[2]
    processes = cpu_count()
    print(processes)
    pool = Pool(processes)
    start = perf_counter()
    pool.starmap_async(video_to_json, zip(files))
    pool.close()
    pool.join()
    stop = perf_counter()
    print("Time Taken : ", stop - start)


def key_with_max_val(d):
    max_value = 0
    required_key = ""
    for k in d:
        if d[k] > max_value:
            max_value = d[k]
            required_key = k
    return required_key


def duplicate_analyzer():
    files = next(walk(json_dir))[2]
    data_dict = {}
    for file in files:
        filename = json_dir + file
        with open(filename) as f:
            data = load(f)
        video_hash = data['video_hash']
        count = 0
        duplicate_file_dict = dict()
        for key in video_hash:
            count += 1
            if key in data_dict:
                if data_dict[key] in duplicate_file_dict:
                    duplicate_file_dict[data_dict[key]] = duplicate_file_dict[data_dict[key]] + 1
                else:
                    duplicate_file_dict[data_dict[key]] = 1
            else:
                data_dict[key] = video_hash[key]
        if duplicate_file_dict:
            duplicate_file = key_with_max_val(duplicate_file_dict)
            duplicate_percentage = ((duplicate_file_dict[duplicate_file] / count) * 100)
            if duplicate_percentage > 50:
                file = file[:-5]
                print(file, " is dup of ", duplicate_file)
                src = analyzed_dir + file
                tgt = duplicate_dir + file
                if path.exists(src):
                    rename(src, tgt)
                # else:
                #     print("File already moved")


def mv_analyzed_file():
    files = next(walk(json_dir))[2]
    for filename in files:
        filename = filename[:-5]
        src = input_vid_dir + filename
        tgt = analyzed_dir + filename
        if path.exists(src):
            rename(src, tgt)
        # else:
        #     print("File already moved")


if __name__ == '__main__':
    mv_analyzed_file()
    multiprocess_video_to_json()
    mv_analyzed_file()
    duplicate_analyzer()

How to automatically turn BibTex citation into something parseable by Zotero?

I have a citation system which publishes users' notes to a wiki (Researchr). Programmatically, I have access to the full BibTeX record of each entry, and I also display this on the individual pages (for example - click on BibTeX). This is in the interest of making it easy for users of other citation managers to automatically import the citation of a paper that interests them. I would also like other citation managers, especially Zotero, to be able to automatically detect and import a citation.
Zotero lists a number of ways of exposing metadata that it will understand, including meta tags with RDF, COiNS, Dublin Core and unAPI. Is there a Ruby library for converting BibTeX to any of these standards automatically - or a Javascript library? I could probably create something, but if something existed, it would be far more robust (BibTeX has so many publication types and fields etc).
There's a BibTeX2RDF converter available here, which might be what you're after.
unAPI is not a data standard - it's a way to serve data (to Zotero and other programs). Zotero imports Bibtex, so serving Bibtex via unAPI works just fine. Inspire is an example of a site that does that:
http://inspirehep.net/
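As a sketch of what serving BibTeX over unAPI could look like (a hypothetical Flask endpoint; lookup_bibtex is an assumed helper, and the exact XML shape should be checked against the unAPI spec):
# Hypothetical sketch of a minimal unAPI endpoint serving BibTeX with Flask.
from flask import Flask, Response, request

app = Flask(__name__)

def lookup_bibtex(record_id):
    # Assumed helper: fetch the stored BibTeX record for this id.
    raise NotImplementedError

@app.route("/unapi")
def unapi():
    record_id = request.args.get("id")
    fmt = request.args.get("format")
    if record_id and fmt == "bibtex":
        return Response(lookup_bibtex(record_id), mimetype="text/x-bibtex")
    # With no format parameter, unAPI expects an XML list of the offered formats.
    xml = ('<?xml version="1.0" encoding="UTF-8"?>'
           '<formats><format name="bibtex" type="text/x-bibtex" /></formats>')
    return Response(xml, mimetype="application/xml")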
Nowadays you can simply import BibTeX files of type .bib directly into Zotero. However, I noticed my BibTeX files were often less complete than Zotero's records (in particular they often missed a DOI), and I did not find an "auto-complete" function (based on the data in the BibTeX entries) in Zotero.
So I import the .bib file with Zotero to ensure the entries are all in there. Then I run a Python script that finds all the missing DOIs it can for the entries in that .bib file and exports them to a space-separated .txt file:
# pip install habanero
from habanero import Crossref
import re


def titletodoi(keyword):
    cr = Crossref()
    result = cr.works(query=keyword)
    items = result["message"]["items"]
    item_title = items[0]["title"]
    tmp = ""
    for it in item_title:
        tmp += it
    title = keyword.replace(" ", "").lower()
    title = re.sub(r"\W", "", title)
    # print('title: ' + title)
    tmp = tmp.replace(" ", "").lower()
    tmp = re.sub(r"\W", "", tmp)
    # print('tmp: ' + tmp)
    if title == tmp:
        doi = items[0]["DOI"]
        return doi
    else:
        return None


def get_dois(titles):
    dois = []
    for title in titles:
        try:
            doi = titletodoi(title)
            print(f"doi={doi}, title={title}")
            if not doi is None:
                dois.append(doi)
        except:
            pass
            # print("An exception occurred")
    print(f"dois={dois}")
    return dois


def read_titles_from_file(filepath):
    with open(filepath) as f:
        lines = f.read().splitlines()
        split_lines = splits_lines(lines)
        return split_lines


def splits_lines(lines):
    split_lines = []
    for line in lines:
        new_lines = line.split(";")
        for new_line in new_lines:
            split_lines.append(new_line)
    return split_lines


def write_dois_to_file(dois, filename, separation_char):
    textfile = open(filename, "w")
    for doi in dois:
        textfile.write(doi + separation_char)
    textfile.close()


filepath = "list_of_titles.txt"
titles = read_titles_from_file(filepath)
dois = get_dois(titles)

write_dois_to_file(dois, "dois_space.txt", " ")
write_dois_to_file(dois, "dois_per_line.txt", "\n")
The DOIs from the .txt file are fed into Zotero's magic wand ("Add Item(s) by Identifier"). Next, I (manually) remove the duplicates by choosing the latest added entry (because that is the one from the magic wand with the most data).
After that, I run another script to update all the reference IDs in my .tex and .bib files to those generated by Zotero:
# Importing library
import bibtexparser
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import *
import os, fnmatch
import Levenshtein as lev


# Let's define a function to customize our entries.
# It takes a record and returns this record.
def customizations(record):
    """Use some functions delivered by the library

    :param record: a record
    :returns: -- customized record
    """
    record = convert_to_unicode(record)  # was `record = type(record)`, which discards the record
    record = author(record)
    record = editor(record)
    record = journal(record)
    record = keyword(record)
    record = link(record)
    record = page_double_hyphen(record)
    record = doi(record)
    return record


def get_references(filepath):
    with open(filepath) as bibtex_file:
        parser = BibTexParser()
        parser.customization = customizations
        bib_database = bibtexparser.load(bibtex_file, parser=parser)
        # print(bib_database.entries)
    return bib_database


def get_reference_mapping(main_filepath, sub_filepath):
    found_sub = []
    found_main = []
    main_into_sub = []
    main_references = get_references(main_filepath)
    sub_references = get_references(sub_filepath)
    for main_entry in main_references.entries:
        for sub_entry in sub_references.entries:
            # Match the reference ID if 85% similar titles are detected
            lev_ratio = lev.ratio(
                remove_curly_braces(main_entry["title"]).lower(),
                remove_curly_braces(sub_entry["title"]).lower(),
            )
            if lev_ratio > 0.85:
                print(f"lev_ratio={lev_ratio}")
                if main_entry["ID"] != sub_entry["ID"]:
                    print(f'replace: {sub_entry["ID"]} with: {main_entry["ID"]}')
                    main_into_sub.append([main_entry, sub_entry])
                # Keep track of which entries have been found
                found_sub.append(sub_entry)
                found_main.append(main_entry)
    return (
        main_into_sub,
        found_main,
        found_sub,
        main_references.entries,
        sub_references.entries,
    )


def remove_curly_braces(string):
    left = string.replace("{", "")
    right = left.replace("}", "")  # was replacing "{" twice, leaving "}" in place
    return right


def replace_references(main_into_sub, directory):
    for pair in main_into_sub:
        main = pair[0]["ID"]
        sub = pair[1]["ID"]
        print(f"replace: {sub} with: {main}")
        # UNCOMMENT IF YOU WANT TO ACTUALLY DO THE PRINTED REPLACEMENT
        # findReplace(latex_root_dir, sub, main, "*.tex")
        # findReplace(latex_root_dir, sub, main, "*.bib")


def findReplace(directory, find, replace, filePattern):
    for path, dirs, files in os.walk(os.path.abspath(directory)):
        for filename in fnmatch.filter(files, filePattern):
            filepath = os.path.join(path, filename)
            with open(filepath) as f:
                s = f.read()
            s = s.replace(find, replace)
            with open(filepath, "w") as f:
                f.write(s)


def list_missing(main_references, sub_references):
    for sub in sub_references:
        if not sub["ID"] in list(map(lambda x: x["ID"], main_references)):
            print(f'the following reference has a changed title: {sub["ID"]}')


latex_root_dir = "some_path/"
main_filepath = f"{latex_root_dir}latex/Literature_study/zotero.bib"
sub_filepath = f"{latex_root_dir}latex/Literature_study/references.bib"

(
    main_into_sub,
    found_main,
    found_sub,
    main_references,
    sub_references,
) = get_reference_mapping(main_filepath, sub_filepath)

replace_references(main_into_sub, latex_root_dir)
list_missing(main_references, sub_references)

# For those references which have a Levenshtein ratio below 0.85 you can specify a manual swap:
manual_swap = []  # main into sub
# manual_swap.append(["cantley_impact_2021", "cantley2021impact"])
# manual_swap.append(["widemann_envision_2021", "widemann2020envision"])
for pair in manual_swap:
    main = pair[0]
    sub = pair[1]
    print(f"replace: {sub} with: {main}")
    # UNCOMMENT IF YOU WANT TO ACTUALLY DO THE PRINTED REPLACEMENT
    # findReplace(latex_root_dir, sub, main, "*.tex")
    # findReplace(latex_root_dir, sub, main, "*.bib")
