import numpy as np
from elastipy import Search, query
They always say: put the imports at the top!
git commit analytics
Below we use a lot of pandas
and plotting to get insight into
the community of an open source project.
To explore a repository of your choice move to elastipy/examples/
and call:
python gitlogs.py <project-name> path/to/git-repo
If you are cloning
a repository and are just interested in commits
you can somewhat limit the size on disk with:
git clone <repo-url> --no-checkout
Replace the <project-name>
with the name of the project and change
the value below in the notebook
:
PROJECT = "pandas"
def search():
return Search(f"elastipy-example-commits-{PROJECT}")
activity
commits per week
s = search()
agg = s.agg_date_histogram("date", calendar_interval="week")
df = agg.execute().df(to_index=True)
df["commits/week"] = df.pop("date.doc_count")
df["smooth"] = df.rolling(window=50).mean()
df.plot(figsize=(15,4), color=["lightblue", "blue"])
additions/deletions per week
s = search()
agg = s.agg_date_histogram("date", calendar_interval="month")
agg.metric_sum("add", field="changes.additions")
agg.metric_sum("del", field="changes.deletions")
df = agg.execute().df(to_index=True, exclude="*doc_count")
#df = df.rolling(window=10).mean()[["add", "del"]]
df.plot.line(color=["green", "pink"], figsize=(15,4))
commits per weekday/hour for each year
def commits_per(field, interval="year"):
s = search()
agg = s.agg_date_histogram(interval, calendar_interval=interval)
#agg = s.agg_terms("author", field="author")
agg = agg.agg_terms("weekday", field=field, size=100)
agg.execute().plot.heatmap(
sort=True, transpose=True,
annot=False, fmt=".0f", cmap="gray_r", figsize=(15, .3),
)
commits_per("timestamp_weekday")
commits_per("timestamp_hour")
commit messages
the first ten commit messages
s = search().sort("timestamp")
# s = s.range("timestamp", gte="2020")
for d in s.execute().documents:
print(("-- %(timestamp)s %(hash)s\n%(message)s" % d).strip() + "\n")
-- 2009-07-31T15:07:16+00:00 9d0080576446de475d34b0dbb58389b15cd4f529
Initial directory structure.
git-svn-id: http://pandas.googlecode.com/svn/trunk@1 d5231056-7de3-11de-ac95-d976489f1ece
-- 2009-08-05T02:32:49+00:00 ec1a0a2a2571dc2c1c26612b374d4a66b22f0938
adding trunk
git-svn-id: http://pandas.googlecode.com/svn/trunk@2 d5231056-7de3-11de-ac95-d976489f1ece
-- 2009-08-05T02:33:13+00:00 1eeadf4e401647faa20911f531bc05c1872262ea
oops
git-svn-id: http://pandas.googlecode.com/svn/trunk@3 d5231056-7de3-11de-ac95-d976489f1ece
-- 2009-08-05T03:17:29+00:00 445114e1b20da8d4976c8d9050aa90c5bd508c54
added svn:ignore
git-svn-id: http://pandas.googlecode.com/svn/trunk@4 d5231056-7de3-11de-ac95-d976489f1ece
-- 2009-08-05T03:30:16+00:00 c6b236db73ff81007909be6406f0e484edc4a9eb
first commit with cleaned up code
git-svn-id: http://pandas.googlecode.com/svn/trunk@5 d5231056-7de3-11de-ac95-d976489f1ece
-- 2009-08-05T03:40:05+00:00 c8efebf2bfbe6a1efc732679ad3cf2d06d795c3f
minor edit
git-svn-id: http://pandas.googlecode.com/svn/trunk@6 d5231056-7de3-11de-ac95-d976489f1ece
-- 2009-08-05T03:54:33+00:00 21e01d94a0632539f76eb702408540b0d9adcb59
fixed isinf reference
git-svn-id: http://pandas.googlecode.com/svn/trunk@7 d5231056-7de3-11de-ac95-d976489f1ece
-- 2009-09-01T15:10:47+00:00 0f6d8b435670053a393b65c621d6eab090a36633
latest edits, miscellaneous cleanup and bug fixes from development
git-svn-id: http://pandas.googlecode.com/svn/trunk@8 d5231056-7de3-11de-ac95-d976489f1ece
-- 2009-09-01T15:13:32+00:00 171487fd4ea85aa38b224ee3cd5c41356063e197
added stats empty directory
git-svn-id: http://pandas.googlecode.com/svn/trunk@9 d5231056-7de3-11de-ac95-d976489f1ece
-- 2009-09-01T15:50:21+00:00 39c033cbe697b488f6f612c9d154a467aaca76a1
fixed inconsistency with dateCol parameter
git-svn-id: http://pandas.googlecode.com/svn/trunk@10 d5231056-7de3-11de-ac95-d976489f1ece
significant terms by year
def significant_terms_by_year(s, field, size=4, shard_size=100):
agg = s.copy().agg_date_histogram("year", calendar_interval="year")
agg = agg.agg_significant_terms(field=field, size=size, shard_size=shard_size)
keywords = set(k[-1] for k in agg.execute().keys())
agg = s.agg_date_histogram("date", calendar_interval="year")
agg = agg.agg_filters("word", filters={key: query.Term(field, key) for key in keywords})
agg.execute().plot.heatmap(
sort=True, replace={0: np.nan},
transpose=True, annot=True, fmt=".0f", figsize=(.3, .7), cmap="gray_r"
)
significant_terms_by_year(search(), "message")
files
overall top 50 edited files per year
s = search()
agg = s.agg_terms(field="changes.file", size=50)
agg = agg.agg_date_histogram("date", calendar_interval="year")
df = agg.execute().plot.heatmap(
sort=True, replace={0: np.nan},
annot=True, fmt=".0f", figsize=(.3, 1.5), cmap="gray_r"
)
significant changed files by year
s = search().param(rest_total_hits_as_int=True)
# remove version specific files
s = ~s.query_string("changes.file: *.txt *.rst")
significant_terms_by_year(s, "changes.file")
which files get edited together
s = search()
s = s.query_string("changes.file: __init__.py")
agg = s.agg_terms(field="changes.file", size=50)
agg = agg.agg_date_histogram("date", calendar_interval="year")
try:
agg.execute().plot.heatmap(figsize=(.3, 1.5), cmap="gray_r")
except ValueError:
pass