I'd like to create some statistics for open source projects. Specifically, I'd like to extract when a given person started working on a git repository and when they finished with it; in other words, the first and the last commit by an author, with dates.
I can extract this information manually from git log output, but is there a built-in git feature, or a one-liner shell script, that would help me analyse this?
Find all repo authors from git log
Get their first and last commit date
For usernames and commit counts
git shortlog -sne | awk '{print $1 " " $2}'
Last commit date
git log --pretty=format:"%ad by %an" --date=iso | sort -r | awk '{if (!seen[$5]++) print}'
First commit date
git log --pretty=format:"%ad by %an" --date=iso | sort | awk '{if (!seen[$5]++) print}'
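If you want the first and the last date per author in a single pass, here is a rough sketch (not from the original answers) that keys on the author email and relies on --date=short so the dates compare lexicographically:
git log --date=short --pretty='%ad %ae' |
awk '{ if (!($2 in first) || $1 < first[$2]) first[$2] = $1
       if (!($2 in last)  || $1 > last[$2])  last[$2]  = $1 }
     END { for (a in first) print a, first[a], "->", last[a] }'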
Here's something to get you started; it will list the authors and how many commits each of them has made:
git log --format='%aN <%aE>' | awk '{arr[$0]++} END{for (i in arr){print arr[i], i;}}' | sort -rn
I had to do this over multiple repositories, so shell tools proved inflexible. I wrote a quick Python script that does the job. README here.
import sys
import logging
import os.path
from datetime import datetime
from collections import defaultdict
from typing import List
from typing import Optional
from typing import Dict
from dataclasses import dataclass
from dataclasses import field
from pathlib import Path
from git import Repo
from tabulate import tabulate
# If logging is set to debug you will see individual git commands in the console
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("githistory")
@dataclass
class AuthorHistory:
"""Author history information for a single repo"""
repo: str
name: str
email: str
first_commit_at: datetime = None
first_commit_message: str = None
last_commit_at: datetime = None
last_commit_message: str = None
commit_count: int = 0
@dataclass
class AuthorHistoryOverMultipleRepos:
"""Author history information spanning over multiple repos"""
# repo name -> history
histories: Dict[str, AuthorHistory] = field(default_factory=dict)
first_commit_at: datetime = datetime(2030, 1, 1)
last_commit_at: datetime = datetime(1970, 1, 1)
commit_count: int = 0
    @property
def email(self):
return next(iter(self.histories.values())).email
    @property
def name(self):
return next(iter(self.histories.values())).name
@dataclass
class RepositoryHistory:
"""History of a single repo"""
name: str
commit_count: int = 0
#: email -> history maps
authors: Dict[str, AuthorHistory] = field(default_factory=dict)
@dataclass
class FullHistory:
"""History of a project spanning multiple repositories"""
repos: List[RepositoryHistory] = field(default_factory=list)
    #: Pointers to individual author histories
#: email -> repo -> AuthorHistory
all_author_histories: Dict[str, AuthorHistoryOverMultipleRepos] = field(default_factory=dict)
def extract_history(path: Path) -> Optional[RepositoryHistory]:
"""Extract history of one git repository.
    :param path: Path to a git repository
    """
logger.info("Extracting history from %s", path)
r = Repo(path)
repo_name = os.path.split(path)[-1]
# Sanity check
heads = r.heads
if len(heads) == 0:
logger.warning("Not a git repository: %s", path)
return None
master = heads.master
history = RepositoryHistory(name=repo_name)
authors = history.authors
    # iter_commits() yields commits newest first; reverse so we walk from first to last
    all_commits = list(r.iter_commits('master'))[::-1]
for c in all_commits: # type: Commit
# https://gitpython.readthedocs.io/en/stable/reference.html#git.objects.commit.Commit
# https://stackoverflow.com/questions/58550252/how-to-get-commit-author-name-and-email-with-gitpython
name = c.author.name
email = c.author.email
author = authors.get(email) # type: AuthorHistory
if not author:
# We are initialising this author
            author = AuthorHistory(repo=repo_name, name=name, email=email)
authors[email] = author
author.first_commit_at = datetime.fromtimestamp(c.committed_date) # Is UNIX time
author.first_commit_message = c.message
author.last_commit_at = datetime.fromtimestamp(c.committed_date)
            author.last_commit_message = c.message
else:
# Adding more commits for the author
author.last_commit_at = datetime.fromtimestamp(c.committed_date)
author.last_commit_message = c.message
author.commit_count += 1
history.commit_count += 1
return history
def mine_authors_over_repos(history: FullHistory) -> Dict[str, AuthorHistoryOverMultipleRepos]:
"""Create a history info spanning over multiple repos."""
all_author_histories = defaultdict(AuthorHistoryOverMultipleRepos)
for r in history.repos:
        for email, author_history in r.authors.items():
            all_history = all_author_histories[email]
            all_history.first_commit_at = min(all_history.first_commit_at, author_history.first_commit_at)
            all_history.last_commit_at = max(all_history.last_commit_at, author_history.last_commit_at)
            all_history.commit_count += author_history.commit_count
            all_history.histories[r.name] = author_history
            logger.debug("Recorded %s history for %s", r.name, email)
return all_author_histories
def mine_data(repos: List[str]) -> FullHistory:
"""Extract history from multiple git repositories.
Will skip directories that do not look like git repos.
"""
logger.info("Working on %d repositores", len(repos))
history = FullHistory()
for repo in repos:
repo_history = extract_history(Path(repo))
if repo_history:
history.repos.append(repo_history)
history.all_author_histories = mine_authors_over_repos(history)
return history
def output_author_data(history: FullHistory):
"""Write out information about authors"""
print("All authors")
print("*" * 80)
table = []
for author in history.all_author_histories.values():
table.append([author.name, author.email, author.first_commit_at, author.last_commit_at, author.commit_count])
# Sort by the first commit
table = sorted(table, key=lambda row: row[2])
    print(tabulate(table, headers=["Name", "Email", "First commit", "Last commit", "Commit count"]))
print()
def main():
"""Entry point"""
history = mine_data(sys.argv[1:])
output_author_data(history)
if __name__ == "__main__":
main()
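To run it over several checkouts at once, the invocation looks roughly like this (assuming the script is saved as git_history.py; the actual filename is in the linked README):
pip install gitpython tabulate
python git_history.py ~/repos/project-one ~/repos/project-two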
Related
from btalib.indicators import sma
import pandas as pd
import backtrader as bt
import os.path #To manage paths
import sys # to find out the script name
import datetime
import matplotlib as plt
from backtrader import cerebro
from numpy import mod #for datetime object
df = pd.read_csv('C:/Users/User/Desktop/programming/dataset/coin_Bitcoin.csv',parse_dates=True, index_col='Date')
sma14 = btalib.sma(df, period = 14)
sma5 = btalib.sma(df, period=5)
class TestStrategy(bt.Strategy):
params = (
('exitbars', 5),
)
def log(self, txt, dt=None):
        # Logging function for this strategy
dt = dt or self.datas[0].datetime.date(0)
print('%s, %s' % (dt.isoformat(), txt))
def __init__(self):
# Keep a reference to the "close" line in the data[0] dataseries
self.dataclose = self.datas[0].close
# To keep track of pending orders
self.order = None
self.buyprice = None
self.buycomm = None
def notify_order(self, order):
if order.status in [order.Submitted, order.Accepted]:
# Buy/Sell order submitted/accepted to/by broker - Nothing to do
return
if order.status in [order.Completed]:
if order.isbuy():
self.log(
'BUY EXECUTED, Price: %.2f, Cost: %.2f, Comm: %.2f' %
(order.executed.price,
order.executed.value,
order.executed.comm))
self.buyprice = order.executed.price
self.buycomm = order.executed.comm
else: #sell
self.log('SELL EXECUTED, Price: %.2f, Cost: %.2f, Comm %.2f'%
(order.executed.price,
order.executed.value,
order.executed.comm))
self.bar_executed = len(self)
elif order.status in [order.Canceled, order.Margin, order.Rejected]:
self.log('Order Canceled/Margin/Reject')
# Write down: no pending order
self.order = None
# Check if an order has been completed
# Attention: broker could reject order if not enough cash
def notify_trade(self, trade):
if not trade.isclosed:
return
self.log('OPERATION PROFIT, GROSS %.2f, NET %.2f' %
(trade.pnl, trade.pnlcomm))
def next(self):
#sma = btalib.sma(df, period=30)
# Simply log the closing price of the series from the reference
self.log('Close, %.2f' % self.dataclose[0])
# Check if an order is pending ... if yes, we cannot send a 2nd one
if self.order:
return
# Check if we are in the market
#if not self.position:
# Not yet ... we MIGHT BUY if ...
if sma5[0] > sma14[0]:
# BUY, BUY, BUY!!! (with all possible default parameters)
self.log('BUY CREATE, %.2f' % self.dataclose[0])
# Keep track of the created order to avoid a 2nd order
self.order = self.buy()
else:
# Already in the market ... we might sell
if sma5[0] < sma14[0]:
# SELL, SELL, SELL!!! (with all possible default parameters)
                self.log('SELL CREATE, %.2f' % self.dataclose[0])
self.order = self.sell()
if __name__ == '__main__':
# Create a cerebro entity
cerebro = bt.Cerebro()
# Add a strategy
cerebro.addstrategy(TestStrategy)
modpath = os.path.dirname(os.path.abspath(sys.argv[0]))
datapath = os.path.join(modpath, 'C:/programming/AlgoTrading/backtest/BTC-USD-YF.csv')
data = bt.feeds.YahooFinanceCSVData(
dataname = datapath,
fromdate = datetime.datetime(2020,5,1),
todate = datetime.datetime(2021,6,1),
reverse = False)
#Add the Data Feed to Cerebro
cerebro.adddata(data)
cerebro.broker.setcash(100000.0)
# Add a FixedSize sizer according to the stake
#cerebro.addsizer(bt.sizers.FixedSize, stake=10)
cerebro.addsizer(bt.sizers.FixedSize)
# Set the commission
cerebro.broker.setcommission(commission=0.0)
# Print out the starting conditions
print('Starting Portfolio Value: %.2f' % cerebro.broker.getvalue())
# Run over everything
cerebro.run()
#print(df(data))
# Print out the final result
print('Final Portfolio Value: %.2f' % cerebro.broker.getvalue())
cerebro.plot()
I have tried to place a buy order when sma5 > sma14 and a sell order when sma5 < sma14, but it doesn't work.
I use backtrader as the backtesting library and btalib for the indicator that generates the signal, via btalib.sma(df, period).
The cerebro object is what runs the backtest.
Sometimes it buys and sells every day: buy today, sell tomorrow.
You probably have to invert the order of your df; this was my problem when computing the RSI with btalib.
Example: df = df.iloc[::-1]
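Applied to the code above, the reversal would slot in right after reading the CSV. This is only a sketch (shortened path, and it assumes the CSV is stored newest-first):
import btalib
import pandas as pd

df = pd.read_csv('coin_Bitcoin.csv', parse_dates=True, index_col='Date')
df = df.iloc[::-1]  # reverse so the oldest row comes first
sma14 = btalib.sma(df, period=14)
sma5 = btalib.sma(df, period=5)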
I am getting this error message when deploying a Streamlit app to Heroku and do not know how to troubleshoot it. I have deployed apps using this setup previously, and removing the config.toml file has always fixed it. But this time is different.
I went to Heroku's website to find help and no luck so far: https://discuss.streamlit.io/t/tomldecodeerror-key-name-found-without-value-reached-end-of-line-when-deploying-on-heroku/12285
It looks like this question has left others hanging as well: this question and this one have the same error message, and neither has an answer so far.
Here is my setup.sh file, which I believe is causing the problem:
mkdir -p ~/.streamlit/
echo "\
[general]\n\
email = \"myemailgmail.com\"\n\
" > ~/.streamlit/credentials.toml
echo "\
[server]\n\
headless = true\n\
enableCORS=false\n\
port = $PORT\n\
" > ~/.streamlit/config.toml
Here is my Procfile:
web: sh setup.sh && streamlit run app.py
Here is the application file itself, app.py:
import streamlit as st
import pandas as pd
import pickle
import plotly.express as px
@st.cache
def load_data(n_rows=3000):
data = pd.read_csv('https://raw.githubusercontent.com/JonathanBechtel/dat-02-22/main/ClassMaterial/Unit3/data/ks2.csv', nrows=n_rows)
return data
@st.cache
def group_data(x_axis, y_axis):
result = data.groupby(x_axis)[y_axis].mean()
return result
@st.cache
def load_model():
with open('mod.pkl', 'rb') as mod:
pipe = pickle.load(mod)
return pipe
st.title("Understanding Kickstarter Applications -- See How Easy It Is Cindy?")
section = st.sidebar.radio('Section', ['Data Explorer', 'Model Predictions'])
n_rows = st.sidebar.number_input("Enter Number of Rows To Load", min_value=1000, max_value=100000, step=1000)
data = load_data(n_rows)
if section == 'Data Explorer':
chart_type = st.sidebar.selectbox('Chart Type', ['Bar', 'Line', 'Strip'])
st.write(data)
x_axis = st.sidebar.selectbox('Choose Column for X-Axis', ['category', 'main_category', 'country'])
y_axis = st.sidebar.selectbox('Choose Column for y-axis', ['state', 'goal'])
st.header(f"Average value for {y_axis} for column {x_axis}")
if chart_type == 'Bar':
result = group_data(x_axis, y_axis)
st.bar_chart(result)
elif chart_type == 'Line':
result = group_data(x_axis, y_axis)
st.line_chart(result)
else:
result = data[[x_axis, y_axis]]
st.plotly_chart(px.strip(result, x=x_axis, y=y_axis, color=x_axis))
elif section == 'Model Predictions':
with open('mod.pkl', 'rb') as mod:
pipe = pickle.load(mod)
print(pipe)
category = st.sidebar.selectbox('Select A Category', data['category'].unique().tolist())
main_category = st.sidebar.selectbox('Select a Main Category', data['main_category'].unique().tolist())
funding_amount = st.sidebar.number_input('Enter Your Funding Amount', min_value=0, value=1000, step=500)
sample = pd.DataFrame({
'category': [category],
'main_category': [main_category],
'funding_amount': [funding_amount]
})
prediction = pipe.predict_proba(sample)
st.header(f"Predicted Probability of Campaign Successs: {prediction[0][1]:.2%}")
My requirements.txt file reads like this:
streamlit == 0.67.0
scikit-learn
pandas
category_encoders == 2.*
xgboost == 1.3.*
protobuf == 3.13.0
plotly == 4.12.0
Any recommendations are welcome, because all wells have been run dry at this point.
You can try:
mkdir -p ~/.streamlit/
echo "[server]\nheadless = true\nport = $PORT\nenableCORS = false\n" > ~/.streamlit/config.toml
I have written a small snippet to automate git add, commit and push using GitPython.
def git_commit_push(self):
repoDir = self.backupRepositoryPath
repo = git.Repo( repoDir )
print repo.git.status()
repo.git.add('--all')
print repo.git.status()
repo.git.commit( m='pusing for backup' )
repo.git.push()
print repo.git.status()
I need to add the two checkpoints below:
1: Before committing, check whether any files are modified. If there are none, skip the commit.
2: Before pushing, check whether there are committed changes to be pushed. If there are none, skip the push.
Please help me write the if conditions for these two checkpoints.
Regards,
Prasad
The logic is tuned here...
def git_commit_push(self):
repoDir = self.backupRepositoryPath
repo = git.Repo( repoDir )
print repo.git.status()
repo.git.add('--all')
changedFiles = repo.index.diff("HEAD")
print "====================================="
print "changedFiles are :", changedFiles
print "====================================="
if ( changedFiles ):
repo.git.commit( m='JenkinsBackup' )
repo.git.push()
else:
print "No files updated"
Hope this helps.
def git_commit_push(self):
repoDir = self.backupRepositoryPath
repo = git.Repo( repoDir )
print repo.git.status()
repo.git.add('--all')
changedFiles = repo.git.diff('HEAD~1..HEAD', name_only=True)
print "====================================="
print "changedFiles are :", changedFiles
print "====================================="
if ( changedFiles ):
repo.git.commit( m=changedFiles )
repo.git.push()
else:
print "No files updated"
I have two XML files which look like the ones below. How can I merge them with a script so that values are taken from new.xml while attributes that exist only in base.xml are retained?
base.xml:
<element name="ind"
dbs="name1, name2, name4"
server="ServerName"
good-attribute="234"/>
My new.xml looks like this:
<element name="ind"
description="My desc"
dbId="someId"
moreAttr="someVal"
dbs="name1, name2, name4, name12, name3"
server="ServerName" />
I would like merge.xml to take the latest dbs value from new.xml while retaining the good-attribute value that is present only in base.xml:
merge.xml:
<element name="ind"
description="My desc"
dbId="someId"
moreAttr="someVal"
dbs="name1, name2, name4, name12, name3"
server="ServerName"
good-attribute="234" />
The paste command only joins the files field-wise, and vimdiff shows the differences but does not let me pick attributes. Is there any built-in support for this, or does it have to be implemented with sed + awk replacements?
Here's a direct implementation in Python:
#!/usr/bin/env python
# Python 2.5+ compatible
import sys
import xml.etree.ElementTree as etree
def main():
output_file = getattr(sys.stdout, 'buffer', sys.stdout)
dict2xml(merge_dicts(*map(xml2dict, sys.argv[1:])), output_file)
def xml2dict(source):
return dict((el.get('name'), el.attrib)
for el in etree.parse(source).getiterator('element'))
def merge_dicts(base, new):
merged = {}
# for each element from new xml
for name, attr in new.items():
# combine attributes from base and new xmls preferring new values
d = merged[name] = base.get(name, {})
d.update(attr)
return merged
def dict2xml(d, sink):
root = etree.Element('root')
for name, attr in d.items():
etree.SubElement(root, 'element', attr)
etree.ElementTree(root).write(sink, encoding='utf-8')
main()
Save this code to a file named merge-xml and run chmod +x merge-xml. Then:
$ ./merge-xml base.xml new.xml >merge.xml
Here's a Python 2.4+ compatible version:
#!/usr/bin/env python
import sys
from xml.dom import minidom
def main():
output_file = getattr(sys.stdout, 'buffer', sys.stdout)
dict2xml(merge_dicts(*map(xml2dict, sys.argv[1:])), output_file)
def xml2dict(source):
doc = minidom.parse(source)
return dict((el.getAttribute('name'), attr2dict(el.attributes))
for el in doc.getElementsByTagName('element'))
def attr2dict(nodemap):
d = {}
for i in range(nodemap.length):
attr = nodemap.item(i)
d[attr.name] = attr.value
return d
def merge_dicts(base, new):
merged = {}
# for each element from new xml
for name, attr in new.items():
# combine attributes from base and new xmls preferring new values
d = merged[name] = base.get(name, {})
d.update(attr)
return merged
def dict2xml(d, sink):
doc = minidom.getDOMImplementation().createDocument(None, "root", None)
root = doc.documentElement
for name, attr in d.items():
el = doc.createElement('element')
for name, value in attr.items():
el.setAttribute(name, value)
root.appendChild(el)
sink.write(doc.toprettyxml(encoding='utf-8'))
main()
I have a citation system which publishes users' notes to a wiki (Researchr). Programmatically, I have access to the full BibTeX record of each entry, and I also display this on the individual pages (for example - click on BibTeX). This is in the interest of making it easy for users of other citation managers to automatically import the citation of a paper that interests them. I would also like other citation managers, especially Zotero, to be able to automatically detect and import a citation.
Zotero lists a number of ways of exposing metadata that it will understand, including meta tags with RDF, COiNS, Dublin Core and unAPI. Is there a Ruby library for converting BibTeX to any of these standards automatically - or a Javascript library? I could probably create something, but if something existed, it would be far more robust (BibTeX has so many publication types and fields etc).
There's a BibTeX2RDF convertor available here, might be what you're after.
unAPI is not a data standard - it's a way to serve data (to Zotero and other programs). Zotero imports Bibtex, so serving Bibtex via unAPI works just fine. Inspire is an example of a site that does that:
http://inspirehep.net/
By now one can simply import bibtex files of type .bib directly in Zotero. However, I noticed my bibtex files were often less complete than Zotero (in particular they often missed a DOI), and I did not find an "auto-complete" function (based on the data in the bibtex entries) in Zotero.
So I import the .bib file with Zotero to ensure they are all in there. Then I run a Python script that finds all the missing DOIs it can for the entries in that .bib file and exports them to a space-separated .txt file:
# pip install habanero
from habanero import Crossref
import re
def titletodoi(keyword):
cr = Crossref()
result = cr.works(query=keyword)
items = result["message"]["items"]
item_title = items[0]["title"]
tmp = ""
for it in item_title:
tmp += it
title = keyword.replace(" ", "").lower()
title = re.sub(r"\W", "", title)
# print('title: ' + title)
tmp = tmp.replace(" ", "").lower()
tmp = re.sub(r"\W", "", tmp)
# print('tmp: ' + tmp)
if title == tmp:
doi = items[0]["DOI"]
return doi
else:
return None
def get_dois(titles):
dois = []
for title in titles:
try:
doi = titletodoi(title)
print(f"doi={doi}, title={title}")
            if doi is not None:
dois.append(doi)
except:
pass
# print("An exception occurred")
print(f"dois={dois}")
return dois
def read_titles_from_file(filepath):
with open(filepath) as f:
lines = f.read().splitlines()
split_lines = splits_lines(lines)
return split_lines
def splits_lines(lines):
split_lines = []
for line in lines:
new_lines = line.split(";")
for new_line in new_lines:
split_lines.append(new_line)
return split_lines
def write_dois_to_file(dois, filename, separation_char):
textfile = open(filename, "w")
for doi in dois:
textfile.write(doi + separation_char)
textfile.close()
filepath = "list_of_titles.txt"
titles = read_titles_from_file(filepath)
dois = get_dois(titles)
write_dois_to_file(dois, "dois_space.txt", " ")
write_dois_to_file(dois, "dois_per_line.txt", "\n")
The DOIs in the .txt file are fed into Zotero's magic wand (Add Item by Identifier). Next, I (manually) remove the duplicates by choosing the most recently added entry (because that is the one from the magic wand, with the most data).
After that, I run another script to update all the reference IDs in my .tex and .bib files to those generated by Zotero:
# Importing library
import bibtexparser
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import *
import os, fnmatch
import Levenshtein as lev
# Let's define a function to customize our entries.
# It takes a record and return this record.
def customizations(record):
"""Use some functions delivered by the library
:param record: a record
:returns: -- customized record
"""
record = type(record)
record = author(record)
record = editor(record)
record = journal(record)
record = keyword(record)
record = link(record)
record = page_double_hyphen(record)
record = doi(record)
return record
def get_references(filepath):
with open(filepath) as bibtex_file:
parser = BibTexParser()
parser.customization = customizations
bib_database = bibtexparser.load(bibtex_file, parser=parser)
# print(bib_database.entries)
return bib_database
def get_reference_mapping(main_filepath, sub_filepath):
found_sub = []
found_main = []
main_into_sub = []
main_references = get_references(main_filepath)
sub_references = get_references(sub_filepath)
for main_entry in main_references.entries:
for sub_entry in sub_references.entries:
            # Match the reference ID if 85% similar titles are detected
lev_ratio = lev.ratio(
remove_curly_braces(main_entry["title"]).lower(),
remove_curly_braces(sub_entry["title"]).lower(),
)
if lev_ratio > 0.85:
print(f"lev_ratio={lev_ratio}")
if main_entry["ID"] != sub_entry["ID"]:
print(f'replace: {sub_entry["ID"]} with: {main_entry["ID"]}')
main_into_sub.append([main_entry, sub_entry])
# Keep track of which entries have been found
found_sub.append(sub_entry)
found_main.append(main_entry)
return (
main_into_sub,
found_main,
found_sub,
main_references.entries,
sub_references.entries,
)
def remove_curly_braces(string):
left = string.replace("{", "")
right = left.replace("{", "")
return right
def replace_references(main_into_sub, directory):
for pair in main_into_sub:
main = pair[0]["ID"]
sub = pair[1]["ID"]
print(f"replace: {sub} with: {main}")
# UNCOMMENT IF YOU WANT TO ACTUALLY DO THE PRINTED REPLACEMENT
# findReplace(latex_root_dir, sub, main, "*.tex")
# findReplace(latex_root_dir, sub, main, "*.bib")
def findReplace(directory, find, replace, filePattern):
for path, dirs, files in os.walk(os.path.abspath(directory)):
for filename in fnmatch.filter(files, filePattern):
filepath = os.path.join(path, filename)
with open(filepath) as f:
s = f.read()
s = s.replace(find, replace)
with open(filepath, "w") as f:
f.write(s)
def list_missing(main_references, sub_references):
for sub in sub_references:
if not sub["ID"] in list(map(lambda x: x["ID"], main_references)):
print(f'the following reference has a changed title:{sub["ID"]}')
latex_root_dir = "some_path/"
main_filepath = f"{latex_root_dir}latex/Literature_study/zotero.bib"
sub_filepath = f"{latex_root_dir}latex/Literature_study/references.bib"
(
main_into_sub,
found_main,
found_sub,
main_references,
sub_references,
) = get_reference_mapping(main_filepath, sub_filepath)
replace_references(main_into_sub, latex_root_dir)
list_missing(main_references, sub_references)
# For those references which have levenshtein ratio below 85 you can specify a manual swap:
manual_swap = [] # main into sub
# manual_swap.append(["cantley_impact_2021","cantley2021impact"])
# manual_swap.append(["widemann_envision_2021","widemann2020envision"])
for pair in manual_swap:
main = pair[0]
sub = pair[1]
print(f"replace: {sub} with: {main}")
# UNCOMMENT IF YOU WANT TO ACTUALLY DO THE PRINTED REPLACEMENT
# findReplace(latex_root_dir, sub, main, "*.tex")
# findReplace(latex_root_dir, sub, main, "*.bib")