elastipy

Welcome, Seeker of Aggregated Data!

May thou find the information about the usage of elastipy on these pages.

Overview

configuration

By default an elasticsearch host is expected at localhost:9200. There are currently two ways to specify a different connection.

from elasticsearch import Elasticsearch
from elastipy import Search

# Use an explicit Elasticsearch client (or compatible class)
client = Elasticsearch(
    hosts=[{"host": "localhost", "port": 9200}],
    http_auth=("user", "pwd")
)

# create a Search using the specified client
s = Search(index="bla", client=client)

# can also be done later
s = s.client(client)

Check the Elasticsearch API reference for all the parameters.

We can also set a default client at the program start:

from elastipy import connections

connections.set("default", client)

# .. or as parameters, they get converted to an Elasticsearch client
connections.set("default", {"hosts": [{"host": "localhost", "port": 9200}]})

# get a client
connections.get("default")
<Elasticsearch([{'host': 'localhost', 'port': 9200}])>

Different connections can be specified under an alias name:

connections.set("special", {"hosts": [{"host": "special", "port": 1234}]})

s = Search(client="special")
s.get_client()
<Elasticsearch([{'host': 'special', 'port': 1234}])>

aggregations

More details can be found in the tutorial.

# get a search object
s = Search(index="world")

# create an Aggregation class connected to the Search
agg = s.agg_date_histogram(calendar_interval="1w")
# (for date-specific aggregations we can leave out the 'field' parameter
#  it falls back to Search.timestamp_field which is "timestamp" by default)

# submit the whole request
s.execute()

# access the response

list(agg.keys())
['1999-12-27T00:00:00.000Z',
 '2000-01-03T00:00:00.000Z',
 '2000-01-10T00:00:00.000Z',
 '2000-01-17T00:00:00.000Z']
list(agg.values())
[21, 77, 60, 42]

Without a metric these numbers are the document counts.
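
To get other values than document counts we can attach a metric to the date histogram. A minimal sketch, assuming a numeric field called "temperature" exists in the index (return_self=True hands back the metric aggregation instead of its parent):

s = Search(index="world")

# "temperature" is a hypothetical numeric field
avg = s.agg_date_histogram(calendar_interval="1w") \
    .metric_avg("avg-temp", field="temperature", return_self=True)

s.execute()

# the values of the metric, one per week
list(avg.values())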

The above example as a one-liner:

Search(index="world").agg_date_histogram(calendar_interval="1w").execute().to_dict()
{'1999-12-27T00:00:00.000Z': 21,
 '2000-01-03T00:00:00.000Z': 77,
 '2000-01-10T00:00:00.000Z': 60,
 '2000-01-17T00:00:00.000Z': 42}

nested aggregations and metrics

s = Search(index="world")

# the first parameter is the name of the aggregation
#   (if omitted it will be "a0", "a1", and so on)
agg = s \
    .agg_terms("occasion", field="occasion") \
    .agg_rare_terms("rare-excuses", field="excuse", max_doc_count=2) \
    .metric_avg("avg-length", field="conversation_length") \
    .metric_max("max-length", field="conversation_length") \
    .execute()

The rare_terms aggregation is nested inside the terms aggregation, and the two metrics are siblings nested inside rare_terms.

keys(), values(), items() and to_dict() all operate on the current aggregation. For bucket aggregations they typically show the doc_count value.

agg.to_dict()
{('dinner', 'my mouth is too dry'): 1,
 ('dinner', "i can't reach the spoon"): 2}

The rows(), dict_rows() and dump.table() methods operate on the whole aggregation branch:

list(agg.dict_rows())
[{'occasion': 'dinner',
  'occasion.doc_count': 200,
  'rare-excuses': 'my mouth is too dry',
  'rare-excuses.doc_count': 1,
  'avg-length': 163.0,
  'max-length': 163.0},
 {'occasion': 'dinner',
  'occasion.doc_count': 200,
  'rare-excuses': "i can't reach the spoon",
  'rare-excuses.doc_count': 2,
  'avg-length': 109.5,
  'max-length': 133.0}]
agg.dump.table(colors=False)
occasion │ occasion.doc_count │ rare-excuses            │ rare-excuses.doc_count │ avg-length   │ max-length
─────────┼────────────────────┼─────────────────────────┼────────────────────────┼──────────────┼─────────────
dinner   │ 200                │ my mouth is too dry     │ 1 ██████████▌          │ 163.0 ██████ │ 163.0 ██████
dinner   │ 200                │ i can't reach the spoon │ 2 ████████████████████ │ 109.5 ████   │ 133.0 ████▉

queries

from elastipy import query

s = Search(index="prog-world")

# chaining means AND
s = s \
    .term(field="category", value="programming") \
    .term("usage", "widely-used")

# also can use operators
s = s & (
    query.Term("topic", "yet-another-api")
    | query.Term("topic", "yet-another-operator-overload")
)

# .query() replaces the current query
s = s.query(query.MatchAll())

languages_per_country = s.agg_terms(field="country").agg_terms(field="language").execute()

languages_per_country.to_dict()
{('IT', 'PHP'): 28,
 ('IT', 'Python'): 24,
 ('IT', 'C++'): 21,
 ('ES', 'C++'): 29,
 ('ES', 'Python'): 22,
 ('ES', 'PHP'): 18,
 ('US', 'PHP'): 23,
 ('US', 'Python'): 20,
 ('US', 'C++'): 15}

exporting

There is a small helper to export stuff to elasticsearch.

from elastipy import Exporter

class MyExporter(Exporter):
    INDEX_NAME = "my-index"

    # mapping can be defined here
    # it will be sent to elasticsearch before the first document is exported
    MAPPINGS = {
        "properties": {
            "some_field": {"type": "text"},
        }
    }

count, errors = MyExporter().export_list(a_lot_of_objects)

print(f"expored {count} objects, errors: {errors}")
expored 1000 objects, errors: []

It uses bulk requests and is very fast. It also supports document transformation and gives control over the id and sub-index of each document.

import datetime

class MyExporter(Exporter):
    INDEX_NAME = "my-index-*"
    MAPPINGS = {
        "properties": {
            "some_field": {"type": "text"},
            "group": {"type": "keyword"},
            "id": {"type": "keyword"},
            "timestamp": {"type": "date"},
        }
    }

    # if each document has a unique id value we can use it
    # as the elasticsearch id as well. That way we do not
    # create documents twice when exporting them again.
    # Their data just gets updated.
    def get_document_id(self, es_data):
        return es_data["id"]

    # we can bucket documents into separate indices
    def get_document_index(self, es_data):
        return self.index_name().replace("*", es_data["group"])

    # here we can adjust or add some data before it gets exported.
    # it's also possible to split the data into several documents
    #   by yielding or returning a list
    def transform_document(self, data):
        data = data.copy()
        data["timestamp"] = datetime.datetime.now()
        return data

MyExporter().export_list(a_lot_of_objects)
(1000, [])

If we are tired enough we can call:

MyExporter().delete_index()
True

This will actually delete all sub-indices, because of the wildcard * in the INDEX_NAME.

don’t be plastic, elastipy!

Hi there, this tutorial is actually a jupyter notebook and can be found in examples/tutorial.ipynb

exporting some objects

Without too much thinking we can just use the built-in export helper and generate some data.

from elastipy import Exporter

class ShapeExporter(Exporter):
    INDEX_NAME = "elastipy-example-shapes"
    MAPPINGS = {
        "properties": {
            "shape": {"type": "keyword"},
            "color": {"type": "keyword"},
            "area": {"type": "float"},
        }
    }

The INDEX_NAME is obviously the name of the elasticsearch index. The MAPPINGS parameter describes the elasticsearch mapping. Here we say that documents will at least have these common fields: one of type float and two of type keyword, which means they are strings that are not full-text searchable but are efficiently indexed and aggregatable.

The data we create out of thin air:

import random

def shape_generator(count=1000, seed=42):
    rnd = random.Random(seed)
    for i in range(count):
        yield {
            "shape": rnd.choice(("triangle", "square")),
            "color": rnd.choice(("red", "green", "blue")),
            "area": rnd.gauss(5, 1.3),
        }

Now create our exporter and export a couple of documents. It uses the bulk helper tools internally.

exporter = ShapeExporter()

count, errors = exporter.export_list(shape_generator(), refresh=True)

print(count, "exported")
1000 exported

The refresh=True parameter will refresh the index as soon as everything is exported, so we do not have to wait for objects to appear in the elasticsearch index.

query oh elastipyia

In most cases this import is enough to access all the good stuff:

from elastipy import Search, query

Now get some documents:

s = Search(index="elastipy-example-shapes")

s is now a search request that can be configured. Setting any search-related option will always return a new instance. Here we set the maximum number of documents to return:

s = s.size(3)
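
Because each of these calls returns a copy, the previous instance stays untouched. A tiny sketch to illustrate:

base = Search(index="elastipy-example-shapes")
limited = base.size(3)
assert limited is not base  # 'base' keeps its default size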

Next we add a query, more specifically a term query.

s = s.term(field="color", value="green")

Our request to elasticsearch would look like this right now:

s.dump.body()
{
  "query": {
    "term": {
      "color": {
        "value": "green"
      }
    }
  },
  "size": 3
}

More queries can be added; by default they are combined with AND:

s = s.range(field="area", gt=5.)
s.dump.body()
{
  "query": {
    "bool": {
      "must": [
        {
          "term": {
            "color": {
              "value": "green"
            }
          }
        },
        {
          "range": {
            "area": {
              "gt": 5.0
            }
          }
        }
      ]
    }
  },
  "size": 3
}

OR combinations can be achieved with the bool query itself or by applying the | operator to the query classes in elastipy.query:

s = s | (query.Term(field="color", value="red") & query.Range(field="area", gt=8.))
s.dump.body()
{
  "query": {
    "bool": {
      "should": [
        {
          "bool": {
            "must": [
              {
                "term": {
                  "color": {
                    "value": "green"
                  }
                }
              },
              {
                "range": {
                  "area": {
                    "gt": 5.0
                  }
                }
              }
            ]
          }
        },
        {
          "bool": {
            "must": [
              {
                "term": {
                  "color": {
                    "value": "red"
                  }
                }
              },
              {
                "range": {
                  "area": {
                    "gt": 8.0
                  }
                }
              }
            ]
          }
        }
      ]
    }
  },
  "size": 3
}

Better execute the search now before the body gets too complicated:

response = s.execute()
response.dump()
{
  "took": 8,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 185,
      "relation": "eq"
    },
    "max_score": 2.1868048,
    "hits": [
      {
        "_index": "elastipy-example-shapes",
        "_type": "_doc",
        "_id": "1Lf0jHcBB26LJVfaIvox",
        "_score": 2.1868048,
        "_source": {
          "shape": "square",
          "color": "red",
          "area": 9.422362274394294
        }
      },
      {
        "_index": "elastipy-example-shapes",
        "_type": "_doc",
        "_id": "FLf0jHcBB26LJVfaIvsx",
        "_score": 2.1868048,
        "_source": {
          "shape": "triangle",
          "color": "red",
          "area": 8.011022752102972
        }
      },
      {
        "_index": "elastipy-example-shapes",
        "_type": "_doc",
        "_id": "OLf0jHcBB26LJVfaIvsx",
        "_score": 2.1868048,
        "_source": {
          "shape": "square",
          "color": "red",
          "area": 8.001834685241512
        }
      }
    ]
  }
}

The response object is a small wrapper around dict that has some convenience properties.

response.documents
[{'shape': 'square', 'color': 'red', 'area': 9.422362274394294},
 {'shape': 'triangle', 'color': 'red', 'area': 8.011022752102972},
 {'shape': 'square', 'color': 'red', 'area': 8.001834685241512}]

How many documents are there at all?

Search(index="elastipy-example-shapes").execute().total_hits
1000

The functions and properties are designed to be chainable, allowing for short and powerful one-liners:

Search(index="elastipy-example-shapes") \
    .size(20).sort("-area").execute().documents
[{'shape': 'triangle', 'color': 'red', 'area': 10.609408732815844},
 {'shape': 'square', 'color': 'blue', 'area': 9.785991184126697},
 {'shape': 'square', 'color': 'red', 'area': 9.422362274394294},
 {'shape': 'triangle', 'color': 'blue', 'area': 9.24591971667655},
 {'shape': 'square', 'color': 'blue', 'area': 9.11442473191995},
 {'shape': 'square', 'color': 'green', 'area': 8.928816107277179},
 {'shape': 'square', 'color': 'blue', 'area': 8.473742067630953},
 {'shape': 'triangle', 'color': 'green', 'area': 8.128635913090859},
 {'shape': 'triangle', 'color': 'blue', 'area': 8.033908240900079},
 {'shape': 'square', 'color': 'green', 'area': 8.030514737232895},
 {'shape': 'triangle', 'color': 'red', 'area': 8.011022752102972},
 {'shape': 'square', 'color': 'red', 'area': 8.001834685241512},
 {'shape': 'square', 'color': 'green', 'area': 7.986094071679083},
 {'shape': 'square', 'color': 'blue', 'area': 7.984604837392737},
 {'shape': 'triangle', 'color': 'blue', 'area': 7.965890845028483},
 {'shape': 'square', 'color': 'green', 'area': 7.937110248587943},
 {'shape': 'square', 'color': 'blue', 'area': 7.933212484940288},
 {'shape': 'triangle', 'color': 'blue', 'area': 7.900062931477738},
 {'shape': 'square', 'color': 'green', 'area': 7.892344075058484},
 {'shape': 'triangle', 'color': 'blue', 'area': 7.883278182699227}]

So much for filtering and the documents in the response. There is a lot of functionality in elasticsearch that is not covered by this library right now. To move on in happiness we just start the next chapter.

agitated aggregation

Aggregations can be created using the agg_, metric_ and pipeline_ prefixes. An aggregation is attached to the Search instance, so there is no copying like with the queries above.

s = Search(index="elastipy-example-shapes").size(0)

agg = s.agg_terms(field="shape")

s.dump.body()
{
  "aggregations": {
    "a0": {
      "terms": {
        "field": "shape"
      }
    }
  },
  "query": {
    "match_all": {}
  },
  "size": 0
}

As we can see, a terms aggregation has been added to the search body. The names of aggregations are auto-generated, but can be explicitly stated:

s = Search(index="elastipy-example-shapes").size(0)

agg = s.agg_terms("shapes", field="shape")

s.dump.body()
{
  "aggregations": {
    "shapes": {
      "terms": {
        "field": "shape"
      }
    }
  },
  "query": {
    "match_all": {}
  },
  "size": 0
}

Let’s look at the result from elasticsearch:

s.execute().dump()
{
  "took": 2,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 1000,
      "relation": "eq"
    },
    "max_score": null,
    "hits": []
  },
  "aggregations": {
    "shapes": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "square",
          "doc_count": 513
        },
        {
          "key": "triangle",
          "doc_count": 487
        }
      ]
    }
  }
}

valuable access

Because we kept the agg variable, we can use its interface to access the values more conveniently:

agg.to_dict()
{'square': 513, 'triangle': 487}

It supports the items(), keys() and values() generators as known from the dict type:

for key, value in agg.items():
    print(f"{key:12} {value}")
square       513
triangle     487

It also has a dict_rows() generator which preserves the names and keys of the aggregations:

for row in agg.dict_rows():
    print(row)
{'shapes': 'square', 'shapes.doc_count': 513}
{'shapes': 'triangle', 'shapes.doc_count': 487}

The rows() generator flattens the dict_rows() output into CSV-style lists:

for row in agg.rows():
    print(row)
['shapes', 'shapes.doc_count']
['square', 513]
['triangle', 487]

And we can print a nice table to the command-line:

agg.dump.table(colors=False)
shapes   │ shapes.doc_count
─────────┼────────────────────────────────────────────
square   │ 513 ███████████████████████████████████████
triangle │ 487 █████████████████████████████████████

(The colors=False parameter disables console colors because they do not work in this documentation)


Obviously, at this point a couple of users would not understand why there is no conversion to a pandas DataFrame built in:

agg.to_pandas()  # or simply agg.df()
     shapes  shapes.doc_count
0    square               513
1  triangle               487

The columns are assigned automatically.

Columns containing ISO-formatted date strings will be converted to pandas.Timestamp.

The DataFrame index column can be assigned with the index and to_index parameters.

index simply copies the column:

agg.to_pandas(index="shapes")
            shapes  shapes.doc_count
shapes
square      square               513
triangle  triangle               487

to_index will move the column:

agg.to_pandas(to_index="shapes")
          shapes.doc_count
shapes
square                 513
triangle               487

With matplotlib installed we can access the pandas plotting interface:

agg.df(to_index="shapes").plot.bar()
[image: bar plot of the shape doc_counts]

Satisfied with a little graphic we feel more confident and look into the details of metrics and nested bucket aggregations.

deeper aggregation agitation

agg = Search(index="elastipy-example-shapes") \
    .agg_terms("shapes", field="shape") \
    .agg_terms("colors", field="color") \
    .metric_sum("area", field="area") \
    .metric_avg("area-avg", field="area") \
    .execute()

A few notes:

  • agg_ methods always return the newly created aggregation, so the colors aggregation is nested inside the shapes aggregation.

  • metric_ methods return their parent aggregation (because metrics do not allow a nested aggregation), so we can just continue to call metric_* and each time we add a metric to the colors aggregation. If you need access to the metric object itself, add the return_self=True parameter (a short sketch follows after these notes).

  • The execute method on an aggregation does not return the response but the aggregation itself.
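
As mentioned in the notes above, return_self=True hands back the metric object itself. A small sketch, reusing the shapes index:

s = Search(index="elastipy-example-shapes")

# return_self=True returns the metric aggregation instead of its parent
avg_area = s.agg_terms("shapes", field="shape") \
    .metric_avg("avg-area", field="area", return_self=True)

s.execute()

# one average per 'shapes' bucket
list(avg_area.values())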

Now, what does the to_dict output look like?

agg.to_dict()
{('square', 'blue'): 178,
 ('square', 'green'): 170,
 ('square', 'red'): 165,
 ('triangle', 'blue'): 177,
 ('triangle', 'green'): 170,
 ('triangle', 'red'): 140}

It has put the keys that lead to each value into tuples. Without a lot of thinking we can say:

data = agg.to_dict()
print(f"There are {data[('triangle', 'red')]} red triangles in the database!")
There are 140 red triangles in the database!

But where have the metrics gone?

Generally, keys(), values(), items(), to_dict() and to_matrix() only access the values of the current aggregation (which is colors in the example). However, the keys of all parent bucket aggregations that lead to the values are included.

The methods dict_rows(), rows(), to_pandas() and .dump.table() will access all values from the whole aggregation branch. In this example the branch looks like this:

  • shapes

  • colors

    • area

    • area-avg

agg.dump.table(digits=3, colors=False)
shapes   │ shapes.doc_count    │ colors │ colors.doc_count    │ area                    │ area-avg
─────────┼─────────────────────┼────────┼─────────────────────┼─────────────────────────┼─────────────────────
square   │ 513 ███████████████ │ blue   │ 178 ███████████████ │ 920.602 ███████████████ │ 5.172 █████████████▉
square   │ 513 ███████████████ │ green  │ 170 ██████████████▍ │ 882.643 ██████████████▍ │ 5.192 ██████████████
square   │ 513 ███████████████ │ red    │ 165 █████████████▉  │ 848.229 █████████████▉  │ 5.141 █████████████▉
triangle │ 487 ██████████████▍ │ blue   │ 177 ██████████████▉ │ 891.198 ██████████████▌ │ 5.035 █████████████▋
triangle │ 487 ██████████████▍ │ green  │ 170 ██████████████▍ │ 834.699 █████████████▋  │  4.91 █████████████▍
triangle │ 487 ██████████████▍ │ red    │ 140 ███████████▉    │ 704.332 ███████████▋    │ 5.031 █████████████▋

Now all information is in the table. Note that the shapes.doc_count column contains the same value multiple times. This is because each colors aggregation bucket splits the shapes bucket into multiple results, without changing the overall count of the shapes, of course.

It’s possible to move the keys of sub-aggregations into new columns with the flat parameter. Below we basically say: Drop the colors and colors.doc_count columns and instead create a column for each encountered color key. The names of following sub-aggregations and metrics are appended to each key. (Also the resulting area-avg columns are excluded to not hurt our eyes too much)

agg.dump.table(flat="colors", exclude="*avg", digits=3, colors=False)
shapes   │ shapes.doc_count │ blue       │ blue.area      │ green │ green.area      │ red      │ red.area
─────────┼──────────────────┼────────────┼────────────────┼───────┼─────────────────┼──────────┼──────────────
square   │ 513 ████████████ │ 178 ██████ │ 920.602 ██████ │ 170   │ 882.643 ███████ │ 165 ████ │ 848.229 █████
triangle │ 487 ███████████▌ │ 177 █████▉ │ 891.198 █████▊ │ 170   │ 834.699 ██████▋ │ 140 ███▌ │ 704.332 ████▎

This can be useful for stacking bars in a plot:

agg.df(flat="colors", exclude=("*doc_count", "*area*")).plot.bar(stacked=True)
[image: stacked bar plot of color counts per shape]

Now what is this method with the awesome name to_matrix?

names, keys, matrix = agg.to_matrix()
print("names ", names)
print("keys  ", keys)
print("matrix", matrix)
names  ['shapes', 'colors']
keys   [['square', 'triangle'], ['blue', 'green', 'red']]
matrix [[178, 170, 165], [177, 170, 140]]

It produces a heatmap! At least in two dimensions. In this example we have two dimensions from the bucket aggregations shapes and colors. to_matrix() will produce a matrix with any number of dimensions, but if it’s one or two, we can also convert it to a DataFrame:

agg.df_matrix()
          blue  green  red
square     178    170  165
triangle   177    170  140

To access the values of metrics we have to call to_matrix on a metric aggregation. Our agg variable contains the area and area-avg metrics, which we can reach through the children property. Below is the heatmap of the average area. Except for the values nothing changed, because metrics (and pipelines) do not contribute to the keys.

agg.children[1].df_matrix()
              blue     green       red
square    5.171923  5.192018  5.140783
triangle  5.035015  4.909992  5.030943

Having something like seaborn installed we can easily plot it:

import seaborn as sns

sns.heatmap(agg.df_matrix())
[image: seaborn heatmap of the shapes/colors matrix]

Aggregation

Aggregations can be created on the Search object or inside an existing Aggregation.

from elastipy import Search

s = Search()
agg = s.agg_terms("name_of_agg", field="field", size=100)

value access

Aggregation.keys(key_separator: Optional[str] = None, tuple_key: bool = False)

Iterates through all keys of this aggregation.

For example, a top-level terms aggregation would return all bucketed field values.

For a nested bucket aggregation each key is a tuple of all parent keys as well.

Parameters
  • key_separator – str Optional separator to concat multiple keys into one string

  • tuple_key – bool If True, the key is always a tuple. If False, the key is a string if there is only one key

Returns

generator
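
For nested bucket aggregations the key_separator parameter turns the tuple keys into plain strings. A small sketch, assuming a nested aggregation like the shapes/colors example from the tutorial:

for key in agg.keys(key_separator="|"):
    print(key)   # e.g. "square|red"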

Aggregation.values(default=None)

Iterates through all values of this aggregation.

Parameters

default – If not None any None-value will be replaced by this.

Returns

generator

Aggregation.items(key_separator: Optional[str] = None, tuple_key: bool = False, default=None) → Iterable[Tuple]

Iterates through all key, value tuples.

Parameters
  • key_separator – str Optional separator to concat multiple keys into one string.

  • tuple_key

    bool If True, the key is always a tuple.

    If False, the key is a string if there is only one key.

  • default – If not None any None-value will be replaced by this.

Returns

generator

Aggregation.rows(header: bool = True, include: Optional[Union[str, Sequence[str]]] = None, exclude: Optional[Union[str, Sequence[str]]] = None, flat: Union[bool, str, Sequence[str]] = False, default=None) → Iterable[list]

Iterates through all result values from this aggregation branch.

Each row is a list. The first row contains the names if ‘header’ == True.

This will include all parent aggregations (up to the root) and all children aggregations (including metrics).

Parameters
  • header – bool If True, the first row contains the names of the columns

  • include – str or sequence of str Can be one or more (OR-combined) wildcard patterns. If used, any column that does not fit a pattern is removed.

  • exclude – str or sequence of str Can be one or more (OR-combined) wildcard patterns. If used, any column that fits a pattern is removed.

  • flat

    bool, str or sequence of str Can be one or more aggregation names that should be flattened out, meaning that each key of the aggregation creates a new column instead of a new row. If True, all bucket aggregations are flattened.

    Only supported for bucket aggregations!

    Note

    Currently not supported for the root aggregation!

  • default – This value will be used wherever a value is undefined.

Returns

generator of list

Aggregation.dict_rows(include: Optional[Union[str, Sequence[str]]] = None, exclude: Optional[Union[str, Sequence[str]]] = None, flat: Union[bool, str, Sequence[str]] = False) → Iterable[dict]

Iterates through all result values from this aggregation branch.

This will include all parent aggregations (up to the root) and all children aggregations (including metrics and pipelines).

Parameters
  • include – str or sequence of str Can be one or more (OR-combined) wildcard patterns. If used, any column that does not fit a pattern is removed.

  • exclude – str or sequence of str Can be one or more (OR-combined) wildcard patterns. If used, any column that fits a pattern is removed.

  • flat

    bool, str or sequence of str Can be one or more aggregation names that should be flattened out, meaning that each key of the aggregation creates a new column instead of a new row. If True, all bucket aggregations are flattened.

    Only supported for bucket aggregations!

    Note

    Currently not supported for the root aggregation!

Returns

generator of dict

Aggregation.to_dict(key_separator=None, default=None) → dict

Create a dictionary from all key/value pairs.

Parameters
  • key_separator – str, optional separator to concat multiple keys into one string

  • default – If not None any None-value will be replaced by this.

Returns

dict

Aggregation.to_pandas(index: Union[bool, str] = False, to_index: Union[bool, str] = False, include: Optional[Union[str, Sequence[str]]] = None, exclude: Optional[Union[str, Sequence[str]]] = None, flat: Union[bool, str, Sequence[str]] = False, dtype=None, default=None)

Converts the results of dict_rows() to a pandas DataFrame.

This will include all parent aggregations (up to the root) and all children aggregations (including metrics).

Any columns containing dates will be automatically converted to pandas.Timestamp.

This method has a synonym: df

Parameters
  • index

    bool or str Sets a specific column as the index of the DataFrame.

    • If False no explicit index is set.

    • If True the root aggregation’s keys will be the index.

    • if str explicitly set a certain column as the DataFrame index.

    Note

    The column is kept in the DataFrame. If you want to set a column as index and remove it from the columns, use to_index.

  • to_index

    bool or str Same as index but the column is removed from DataFrame.

    • If False no explicit index is set.

    • If True the root aggregation’s keys will be the index.

    • if str explicitly set a certain column as the DataFrame index.

  • include – str or list of str Can be one or more (OR-combined) wildcard patterns. If used, any column that does not fit a pattern is removed

  • exclude – str or list of str Can be one or more (OR-combined) wildcard patterns. If used, any column that fits a pattern is removed

  • flat

    bool, str or sequence of str Can be one or more aggregation names that should be flattened out, meaning that each key of the aggregation creates a new column instead of a new row. If True, all bucket aggregations are flattened.

    Only supported for bucket aggregations!

    Note

    Currently not supported for the root aggregation!

  • dtype – Numpy data type to force. Only a single dtype is allowed. If None, infer.

  • default – This value will be used wherever a value is undefined.

Returns

pandas DataFrame instance

Aggregation.to_matrix(sort: Optional[Union[bool, str, int, Sequence[Union[str, int]]]] = None, default: Optional[Any] = None, include: Optional[Union[str, Sequence[str]]] = None, exclude: Optional[Union[str, Sequence[str]]] = None) → Tuple[List[str], List, List]

Generate an N-dimensional matrix from the values of this aggregation.

Each dimension corresponds to one of the parent bucket keys that lead to this aggregation.

The values are gathered through the Aggregation.items method. So the matrix values are either the doc_count of the bucket aggregation or the result of a metric or pipeline aggregation that is inside one of the bucket aggregations.

a = Search().agg_terms("color", field="color")
a = a.agg_terms("shape", field="shape")
...
names, keys, matrix = a.to_matrix()
names == ["color", "shape"]
keys == [["red", "green", "blue"], ["circle", "triangle"]]
matrix == [[23, 42], [84, 69], [4, 10]]
Parameters
  • sort

    Can sort one or several keys/axes.

    • True sorts all keys ascending

    • "-" sorts all keys descending

    • The name of an aggregation sorts its keys ascending. A "-" prefix sorts descending.

    • An integer defines the aggregation by index. Negative integers sort descending.

    • A sequence of strings or integers can sort multiple keys

    For example, agg.to_matrix(sort=("color", "-shape", -4)) would sort the color keys ascending, the shape keys descending and the 4th aggregation (whatever that is) descending.

  • default – If not None any None-value will be replaced by this value

  • include – str | seq[str] One or more wildcard patterns that include matching keys. All other keys are removed from the output.

  • exclude – str | seq[str] One or more wildcard patterns that exclude matching keys.

Returns

A tuple of names, keys and matrix data, each as list.

The names are the names of each aggregation that generates keys.

The keys are a list of lists, each corresponding to all the keys of each parent aggregation.

Data is a list, with other nested lists for each further dimension, containing the values of this aggregation.

Returns three empty lists if no data is available.

aggregation interface

The Search class as well as created aggregations themselves support the following interface.

class elastipy.aggregation.Aggregation(search, name, type, params)[source]

Bases: elastipy.aggregation.converter.ConverterMixin, elastipy.aggregation.generated_interface.AggregationInterface

Aggregation definition and response parser.

Do not create instances yourself, use the Search.aggregation() and Aggregation.aggregation() variants.

Once the Search has been executed, the values of the aggregations can be accessed.

agg(*aggregation_name_type, **params)

Alias for aggregation()

agg_adjacency_matrix(*aggregation_name: Optional[str], filters: Mapping[str, Union[Mapping, QueryInterface]], separator: Optional[str] = None)

A bucket aggregation returning a form of adjacency matrix. The request provides a collection of named filter expressions, similar to the filters aggregation request. Each bucket in the response represents a non-empty cell in the matrix of intersecting filters.

The matrix is said to be symmetric so we only return half of it. To do this we sort the filter name strings and always use the lowest of a pair as the value to the left of the "&" separator.

elasticsearch documentation

Parameters
  • aggregation_name – Optional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • filters – Mapping[str, Union[Mapping, 'QueryInterface']]

  • separator – Optional[str] An alternative separator parameter can be passed in the request if clients wish to use a separator string other than the default of the ampersand.

Returns

'AggregationInterface' A new instance is created and returned
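
A minimal usage sketch; the index and the "interests" field are made up for illustration:

from elastipy import Search, query

s = Search(index="some-index")
agg = s.agg_adjacency_matrix(
    "interactions",
    filters={
        "music": query.Term("interests", "music"),
        "sports": query.Term("interests", "sports"),
        "cooking": query.Term("interests", "cooking"),
    },
)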

agg_auto_date_histogram(*aggregation_name: Optional[str], field: Optional[str] = None, buckets: int = 10, minimum_interval: Optional[str] = None, time_zone: Optional[str] = None, format: Optional[str] = None, keyed: bool = False, missing: Optional[Any] = None, script: Optional[dict] = None)

A multi-bucket aggregation similar to the Date histogram except instead of providing an interval to use as the width of each bucket, a target number of buckets is provided indicating the number of buckets needed and the interval of the buckets is automatically chosen to best achieve that target. The number of buckets returned will always be less than or equal to this target number.

The buckets field is optional, and will default to 10 buckets if not specified.

elasticsearch documentation

Parameters
  • aggregation_name – Optional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • field – Optional[str] If no field is specified it will default to the ‘timestamp_field’ of the Search class.

  • buckets – int The number of buckets that are to be returned.

  • minimum_interval

    Optional[str] The minimum_interval allows the caller to specify the minimum rounding interval that should be used. This can make the collection process more efficient, as the aggregation will not attempt to round at any interval lower than minimum_interval.

    The accepted units for minimum_interval are: year, month, day, hour, minute, second

  • time_zone

    Optional[str] Date-times are stored in Elasticsearch in UTC. By default, all bucketing and rounding is also done in UTC. The time_zone parameter can be used to indicate that bucketing should use a different time zone.

    Time zones may either be specified as an ISO 8601 UTC offset (e.g. +01:00 or -08:00) or as a timezone id, an identifier used in the TZ database like America/Los_Angeles.

    Warning

    When using time zones that follow DST (daylight savings time) changes, buckets close to the moment when those changes happen can have slightly different sizes than neighbouring buckets. For example, consider a DST start in the CET time zone: on 27 March 2016 at 2am, clocks were turned forward 1 hour to 3am local time. If the result of the aggregation was daily buckets, the bucket covering that day will only hold data for 23 hours instead of the usual 24 hours for other buckets. The same is true for shorter intervals like e.g. 12h. Here, we will have only an 11h bucket on the morning of 27 March when the DST shift happens.

  • format – Optional[str] Specifies the format of the ‘key_as_string’ response. See: mapping date format

  • keyed – bool Setting the keyed flag to true associates a unique string key with each bucket and returns the ranges as a hash rather than an array.

  • missing – Optional[Any] The missing parameter defines how documents that are missing a value should be treated. By default they will be ignored but it is also possible to treat them as if they had a value.

  • script – Optional[dict] Generating the terms using a script

Returns

'AggregationInterface' A new instance is created and returned
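
A small usage sketch; the index name is made up and the field falls back to the Search timestamp_field:

s = Search(index="some-index")
agg = s.agg_auto_date_histogram(
    "auto-buckets",
    buckets=30,
    minimum_interval="day",
    format="yyyy-MM-dd",
)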

agg_children(*aggregation_name: Optional[str], type: str)

A special single bucket aggregation that selects child documents that have the specified type, as defined in a join field.

elasticsearch documentation

Parameters
  • aggregation_name – Optional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • type – str The child type that should be selected.

Returns

'AggregationInterface' A new instance is created and returned
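
A sketch, assuming an index with a join field where documents of type "answer" are children of "question" documents (index and field names are hypothetical):

s = Search(index="questions-and-answers")
answers = s.agg_children("to-answers", type="answer")
# hypothetical keyword field on the child documents
owners = answers.agg_terms("owners", field="owner")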

agg_composite(*aggregation_name: Optional[str], sources: Sequence[Mapping], size: int = 10, after: Optional[Union[str, int, float, datetime.datetime]] = None)

A multi-bucket aggregation that creates composite buckets from different sources.

Unlike the other multi-bucket aggregations, you can use the composite aggregation to paginate all buckets from a multi-level aggregation efficiently. This aggregation provides a way to stream all buckets of a specific aggregation, similar to what scroll does for documents.

The composite buckets are built from the combinations of the values extracted/created for each document and each combination is considered as a composite bucket.

For optimal performance the index sort should be set on the index so that it partially or fully matches the source order in the composite aggregation.

Sub-buckets: Like any multi-bucket aggregations the composite aggregation can hold sub-aggregations. These sub-aggregations can be used to compute other buckets or statistics on each composite bucket created by this parent aggregation.

Pipeline aggregations: The composite agg is not currently compatible with pipeline aggregations, nor does it make sense in most cases. E.g. due to the paging nature of composite aggs, a single logical partition (one day for example) might be spread over multiple pages. Since pipeline aggregations are purely post-processing on the final list of buckets, running something like a derivative on a composite page could lead to inaccurate results as it is only taking into account a “partial” result on that page.

elasticsearch documentation

Parameters
  • aggregation_name – Optional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • sources

    Sequence[Mapping] The sources parameter defines the source fields to use when building composite buckets. The order that the sources are defined controls the order that the keys are returned.

    The sources parameter can be any of the following types:

    • Terms

    • Histogram

    • Date histogram

    • GeoTile grid

    Note

    You must use a unique name when defining sources.

  • size

    int The size parameter can be set to define how many composite buckets should be returned. Each composite bucket is considered as a single bucket, so setting a size of 10 will return the first 10 composite buckets created from the value sources. The response contains the values for each composite bucket in an array containing the values extracted from each value source.

    Pagination: If the number of composite buckets is too high (or unknown) to be returned in a single response it is possible to split the retrieval in multiple requests. Since the composite buckets are flat by nature, the requested size is exactly the number of composite buckets that will be returned in the response (assuming that there are at least size composite buckets to return). If all composite buckets should be retrieved it is preferable to use a small size (100 or 1000 for instance) and then use the after parameter to retrieve the next results.

  • after

    Optional[Union[str, int, float, datetime]] To get the next set of buckets, resend the same aggregation with the after parameter set to the after_key value returned in the response.

    Note

    The after_key is usually the key to the last bucket returned in the response, but that isn’t guaranteed. Always use the returned after_key instead of deriving it from the buckets.

    In order to optimize the early termination it is advised to set track_total_hits in the request to false. The number of total hits that match the request can be retrieved on the first request and it would be costly to compute this number on every page.

Returns

'AggregationInterface' A new instance is created and returned
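
A sketch of a composite aggregation over the tutorial's shape and color fields; the source definitions follow the elasticsearch request format:

s = Search(index="elastipy-example-shapes")
agg = s.agg_composite(
    "all-combinations",
    sources=[
        {"shape": {"terms": {"field": "shape"}}},
        {"color": {"terms": {"field": "color"}}},
    ],
    size=100,
)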

agg_date_histogram(*aggregation_name: Optional[str], field: Optional[str] = None, calendar_interval: Optional[str] = None, fixed_interval: Optional[str] = None, min_doc_count: int = 1, offset: Optional[str] = None, time_zone: Optional[str] = None, format: Optional[str] = None, keyed: bool = False, missing: Optional[Any] = None, script: Optional[dict] = None)

This multi-bucket aggregation is similar to the normal histogram, but it can only be used with date or date range values. Because dates are represented internally in Elasticsearch as long values, it is possible, but not as accurate, to use the normal histogram on dates as well. The main difference in the two APIs is that here the interval can be specified using date/time expressions. Time-based data requires special support because time-based intervals are not always a fixed length.

elasticsearch documentation

Parameters
  • aggregation_name – Optional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • field – Optional[str] If no field is specified it will default to the ‘timestamp_field’ of the Search class.

  • calendar_interval – Optional[str] Calendar-aware intervals are configured with the calendar_interval parameter. You can specify calendar intervals using the unit name, such as month, or as a single unit quantity, such as 1M. For example, day and 1d are equivalent. Multiple quantities, such as 2d, are not supported.

  • fixed_interval

    Optional[str] In contrast to calendar-aware intervals, fixed intervals are a fixed number of SI units and never deviate, regardless of where they fall on the calendar. One second is always composed of 1000ms. This allows fixed intervals to be specified in any multiple of the supported units.

    However, it means fixed intervals cannot express other units such as months, since the duration of a month is not a fixed quantity. Attempting to specify a calendar interval like month or quarter will throw an exception.

    The accepted units for fixed intervals are:

    • milliseconds (ms): A single millisecond. This is a very, very small interval.

    • seconds (s): Defined as 1000 milliseconds each.

    • minutes (m): Defined as 60 seconds each (60,000 milliseconds). All minutes begin at 00 seconds.

    • hours (h): Defined as 60 minutes each (3,600,000 milliseconds). All hours begin at 00 minutes and 00 seconds.

    • days (d): Defined as 24 hours (86,400,000 milliseconds). All days begin at the earliest possible time, which is usually 00:00:00 (midnight).

  • min_doc_count – int Minimum documents required for a bucket. Set to 0 to allow creating empty buckets.

  • offset

    Optional[str] Use the offset parameter to change the start value of each bucket by the specified positive (+) or negative offset (-) duration, such as 1h for an hour, or 1d for a day. See Time units for more possible time duration options.

    For example, when using an interval of day, each bucket runs from midnight to midnight. Setting the offset parameter to +6h changes each bucket to run from 6am to 6am

  • time_zone

    Optional[str] Elasticsearch stores date-times in Coordinated Universal Time (UTC). By default, all bucketing and rounding is also done in UTC. Use the time_zone parameter to indicate that bucketing should use a different time zone.

    For example, if the interval is a calendar day and the time zone is America/New_York then 2020-01-03T01:00:01Z is

    • converted to 2020-01-02T18:00:01

    • rounded down to 2020-01-02T00:00:00

    • then converted back to UTC to produce 2020-01-02T05:00:00Z

    • finally, when the bucket is turned into a string key it is printed in America/New_York so it’ll display as "2020-01-02T00:00:00"

    It looks like:

    bucket_key = localToUtc(Math.floor(utcToLocal(value) / interval) * interval)

    You can specify time zones as an ISO 8601 UTC offset (e.g. +01:00 or -08:00) or as an IANA time zone ID, such as America/Los_Angeles.

  • format – Optional[str] Specifies the format of the ‘key_as_string’ response. See: mapping date format

  • keyed – bool Setting the keyed flag to true associates a unique string key with each bucket and returns the ranges as a hash rather than an array.

  • missing – Optional[Any] The missing parameter defines how documents that are missing a value should be treated. By default they will be ignored but it is also possible to treat them as if they had a value.

  • script – Optional[dict] Generating the terms using a script

Returns

'AggregationInterface' A new instance is created and returned
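
A small sketch; the index name is made up, the field defaults to the Search timestamp_field and the time zone is just an example value:

s = Search(index="some-index")
agg = s.agg_date_histogram(
    "per-day",
    calendar_interval="day",
    min_doc_count=0,            # also return empty buckets
    time_zone="Europe/Berlin",
)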

agg_date_range(*aggregation_name: Optional[str], ranges: Sequence[Union[Mapping[str, str], str]], field: Optional[str] = None, format: Optional[str] = None, time_zone: Optional[str] = None, keyed: bool = False, missing: Optional[Any] = None, script: Optional[dict] = None)

A range aggregation that is dedicated for date values. The main difference between this aggregation and the normal range aggregation is that the from and to values can be expressed in Date Math expressions, and it is also possible to specify a date format by which the from and to response fields will be returned.

Note

Note that this aggregation includes the from value and excludes the to value for each range.

elasticsearch documentation

Parameters
  • aggregation_name – Optional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • ranges

    Sequence[Union[Mapping[str, str], str]] List of ranges to define the buckets

    Example:

    [
        {"to": "1970-01-01"},
        {"from": "1970-01-01", "to": "1980-01-01"},
        {"from": "1980-01-01"},
    ]
    

    Instead of date values any Date Math expression can be used as well.

    Alternatively this parameter can be a list of strings. The above example can be rewritten as: ["1970-01-01", "1980-01-01"]

    Note

    This aggregation includes the from value and excludes the to value for each range.

  • field

    Optional[str] The date field

    If no field is specified it will default to the ‘timestamp_field’ of the Search class.

  • format – Optional[str] The format of the response bucket keys as available for the DateTimeFormatter

  • time_zone

    Optional[str] Dates can be converted from another time zone to UTC by specifying the time_zone parameter.

    Time zones may either be specified as an ISO 8601 UTC offset (e.g. +01:00 or -08:00) or as one of the time zone ids from the TZ database.

    The time_zone parameter is also applied to rounding in date math expressions.

  • keyed – bool Setting the keyed flag to true associates a unique string key with each bucket and returns the ranges as a hash rather than an array.

  • missing – Optional[Any] The missing parameter defines how documents that are missing a value should be treated. By default they will be ignored but it is also possible to treat them as if they had a value.

  • script – Optional[dict] Generating the terms using a script

Returns

'AggregationInterface' A new instance is created and returned
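
A sketch using the shorthand list-of-strings form for the ranges (index name made up):

s = Search(index="some-index")
agg = s.agg_date_range(
    "decades",
    # creates three buckets: before 1970, 1970-1980, after 1980
    ranges=["1970-01-01", "1980-01-01"],
    format="yyyy-MM-dd",
)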

agg_diversified_sampler(*aggregation_name: Optional[str], field: Optional[str] = None, script: Optional[Mapping] = None, shard_size: int = 100, max_docs_per_value: int = 1)

Like the sampler aggregation this is a filtering aggregation used to limit any sub aggregations’ processing to a sample of the top-scoring documents. The diversified_sampler aggregation adds the ability to limit the number of matches that share a common value such as an “author”.

Note

Any good market researcher will tell you that when working with samples of data it is important that the sample represents a healthy variety of opinions rather than being skewed by any single voice. The same is true with aggregations and sampling with these diversify settings can offer a way to remove the bias in your content (an over-populated geography, a large spike in a timeline or an over-active forum spammer).

Example use cases:

  • Tightening the focus of analytics to high-relevance matches rather than the potentially very long tail of low-quality matches

  • Removing bias from analytics by ensuring fair representation of content from different sources

  • Reducing the running cost of aggregations that can produce useful results using only samples e.g. significant_terms

A choice of field or script setting is used to provide values used for de-duplication and the max_docs_per_value setting controls the maximum number of documents collected on any one shard which share a common value. The default setting for max_docs_per_value is 1.

Note

The aggregation will throw an error if the choice of field or script produces multiple values for a single document (de-duplication using multi-valued fields is not supported due to efficiency concerns).

Limitations:

  • Cannot be nested under breadth_first aggregations. Being a quality-based filter the diversified_sampler aggregation needs access to the relevance score produced for each document. It therefore cannot be nested under a terms aggregation which has the collect_mode switched from the default depth_first mode to breadth_first as this discards scores. In this situation an error will be thrown.

  • Limited de-dup logic. The de-duplication logic applies only at a shard level so will not apply across shards.

  • No specialized syntax for geo/date fields. Currently the syntax for defining the diversifying values is defined by a choice of field or script - there is no added syntactical sugar for expressing geo or date units such as "7d" (7 days). This support may be added in a later release and users will currently have to create these sorts of values using a script.

elasticsearch documentation

Parameters
  • aggregation_name – Optional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • field – Optional[str] The field to search on. Can alternatively be a script

  • script – Optional[Mapping] The script that specifies the aggregation. Can alternatively be a ‘field’

  • shard_size – int The shard_size parameter limits how many top-scoring documents are collected in the sample processed on each shard. The default value is 100.

  • max_docs_per_value – int The max_docs_per_value is an optional parameter and limits how many documents are permitted per choice of de-duplicating value. The default setting is 1.

Returns

'AggregationInterface' A new instance is created and returned
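
A sketch; the "author" and "tags" fields are hypothetical:

s = Search(index="some-index")
sample = s.agg_diversified_sampler(
    "sample",
    field="author",           # de-duplicate on the author value
    shard_size=200,
    max_docs_per_value=3,
)
keywords = sample.agg_terms("keywords", field="tags")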

agg_filter(*aggregation_name: Optional[str], filter: Union[Mapping, QueryInterface])

Defines a single bucket of all the documents in the current document set context that match a specified filter. Often this will be used to narrow down the current aggregation context to a specific set of documents.

elasticsearch documentation

Parameters
  • aggregation_name – Optional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • filter – Union[Mapping, 'QueryInterface']

Returns

'AggregationInterface' A new instance is created and returned
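
A sketch that narrows the aggregation context to red shapes before computing a metric:

from elastipy import Search, query

s = Search(index="elastipy-example-shapes")
red = s.agg_filter("red-only", filter=query.Term("color", "red"))
red.metric_avg("avg-area", field="area")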

agg_filters(*aggregation_name: Optional[str], filters: Mapping[str, Union[Mapping, QueryInterface]])

Defines a multi bucket aggregation where each bucket is associated with a filter. Each bucket will collect all documents that match its associated filter.

elasticsearch documentation

Parameters
  • aggregation_name – Optional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • filters – Mapping[str, Union[Mapping, 'QueryInterface']]

Returns

'AggregationInterface' A new instance is created and returned
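
A sketch with one named filter per bucket, reusing the shapes example:

from elastipy import Search, query

s = Search(index="elastipy-example-shapes")
agg = s.agg_filters(
    "by-color",
    filters={
        "red": query.Term("color", "red"),
        "blue": query.Term("color", "blue"),
    },
)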

agg_geo_distance(*aggregation_name: Optional[str], field: str, ranges: Sequence[Union[Mapping[str, float], float]], origin: Union[str, Mapping[str, float], Sequence[float]], unit: str = 'm', distance_type: str = 'arc', keyed: bool = False)

A multi-bucket aggregation that works on geo_point fields and conceptually works very similarly to the range aggregation. The user can define a point of origin and a set of distance range buckets. The aggregation evaluates the distance of each document value from the origin point and determines the buckets it belongs to based on the ranges (a document belongs to a bucket if the distance between the document and the origin falls within the distance range of the bucket).

elasticsearch documentation

Parameters
  • aggregation_name – Optional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • field – str The specified field must be of type geo_point (which can only be set explicitly in the mappings). And it can also hold an array of geo_point fields, in which case all will be taken into account during aggregation.

  • ranges

    Sequence[Union[Mapping[str, float], float]] A list of ranges that define the separate buckets, e.g:

    [ { "to": 100000 }, { "from": 100000, "to": 300000 }, { "from":
    300000 } ]
    

    Alternatively this parameter can be a list of numbers. The above example can be rewritten as [100000, 300000]

  • origin

    Union[str, Mapping[str, float], Sequence[float]] The origin point can accept all formats supported by the geo_point type:

    • Object format: { "lat" : 52.3760, "lon" : 4.894 } - this is the safest format as it is the most explicit about the lat & lon values

    • String format: "52.3760, 4.894" - where the first number is the lat and the second is the lon

    • Array format: [4.894, 52.3760] - which is based on the GeoJson standard and where the first number is the lon and the second one is the lat

  • unit – str By default, the distance unit is m (meters) but it can also accept: mi (miles), in (inches), yd (yards), km (kilometers), cm (centimeters), mm (millimeters).

  • distance_type – str There are two distance calculation modes: arc (the default), and plane. The arc calculation is the most accurate. The plane is the fastest but least accurate. Consider using plane when your search context is “narrow”, and spans smaller geographical areas (~5km). plane will return higher error margins for searches across very large areas (e.g. cross continent search).

  • keyed – bool Setting the keyed flag to true will associate a unique string key with each bucket and return the ranges as a hash rather than an array.

Returns

'AggregationInterface' A new instance is created and returned
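
A sketch; "location" is a hypothetical geo_point field:

s = Search(index="some-index")
agg = s.agg_geo_distance(
    "rings",
    field="location",
    origin={"lat": 52.3760, "lon": 4.894},
    ranges=[100000, 300000],   # buckets: 0-100km, 100-300km, 300km and beyond
)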

agg_geohash_grid(*aggregation_name: Optional[str], field: str, precision: Union[int, str] = 5, bounds: Optional[Mapping] = None, size: int = 10000, shard_size: Optional[int] = None)

A multi-bucket aggregation that works on geo_point fields and groups points into buckets that represent cells in a grid. The resulting grid can be sparse and only contains cells that have matching data. Each cell is labeled using a geohash which is of user-definable precision.

  • High precision geohashes have a long string length and represent cells that cover only a small area.

  • Low precision geohashes have a short string length and represent cells that each cover a large area.

Geohashes used in this aggregation can have a choice of precision between 1 and 12.

The highest-precision geohash of length 12 produces cells that cover less than a square metre of land and so high-precision requests can be very costly in terms of RAM and result sizes.

elasticsearch documentation

Parameters
  • aggregation_name – Optional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • field

    str The specified field must be of type geo_point or geo_shape (which can only be set explicitly in the mappings). And it can also hold an array of geo_point fields, in which case all will be taken into account during aggregation.

    Aggregating on Geo-shape fields works just as it does for points, except that a single shape can be counted for in multiple tiles. A shape will contribute to the count of matching values if any part of its shape intersects with that tile.

  • precision

    Union[int, str] The required precision of the grid in the range [1, 12]. Higher means more precise.

    Alternatively, the precision level can be approximated from a distance measure like "1km", "10m". The precision level is calculated such that cells will not exceed the specified size (diagonal) of the required precision. When this would lead to precision levels higher than the supported 12 levels (e.g. for distances <5.6cm) the value is rejected.

    Note

    When requesting detailed buckets (typically for displaying a “zoomed in” map) a filter like geo_bounding_box should be applied to narrow the subject area otherwise potentially millions of buckets will be created and returned.

  • boundsOptional[Mapping] The geohash_grid aggregation supports an optional bounds parameter that restricts the points considered to those that fall within the bounds provided. The bounds parameter accepts the bounding box in all the same accepted formats of the bounds specified in the Geo Bounding Box Query. This bounding box can be used with or without an additional geo_bounding_box query filtering the points prior to aggregating. It is an independent bounding box that can intersect with, be equal to, or be disjoint to any additional geo_bounding_box queries defined in the context of the aggregation.

  • sizeint The maximum number of geohash buckets to return (defaults to 10,000). When results are trimmed, buckets are prioritised based on the volumes of documents they contain.

  • shard_sizeOptional[int] To allow for more accurate counting of the top cells returned in the final result the aggregation defaults to returning max(10, (size x number-of-shards)) buckets from each shard. If this heuristic is undesirable, the number considered from each shard can be overridden using this parameter.

Returns

'AggregationInterface' A new instance is created and returned
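
Example (a minimal sketch; the geo_point field "location" is a made-up name):

s = Search(index="world")

agg = s.agg_geohash_grid(
    "grid",              # optional aggregation name
    field="location",    # hypothetical geo_point field
    precision=5,         # geohash length between 1 and 12
    size=1000,
)
agg.execute()

# keys are geohash strings, values are the document counts
agg.to_dict()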

agg_geotile_grid(*aggregation_name: Optional[str], field: str, precision: Union[int, str] = 7, bounds: Optional[Mapping] = None, size: int = 10000, shard_size: Optional[int] = None)

A multi-bucket aggregation that works on geo_point fields and groups points into buckets that represent cells in a grid. The resulting grid can be sparse and only contains cells that have matching data. Each cell corresponds to a map tile as used by many online map sites. Each cell is labeled using a “{zoom}/{x}/{y}” format, where zoom is equal to the user-specified precision.

  • High precision keys have a larger range for x and y, and represent tiles that cover only a small area.

  • Low precision keys have a smaller range for x and y, and represent tiles that each cover a large area.

Warning

The highest-precision geotile of length 29 produces cells that cover less than 10cm by 10cm of land, so high-precision requests can be very costly in terms of RAM and result sizes. Please first filter the aggregation to a smaller geographic area before requesting high levels of detail.

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr The specified field must be of type geo_point (which can only be set explicitly in the mappings). It can also hold an array of geo_point values, in which case all of them are taken into account during aggregation.

  • precision

    Union[int, str] The required precision of the grid in the range [1, 29]. Higher means more precise.

    Note

    When requesting detailed buckets (typically for displaying a “zoomed in” map) a filter like geo_bounding_box should be applied to narrow the subject area otherwise potentially millions of buckets will be created and returned.

  • boundsOptional[Mapping] The geotile_grid aggregation supports an optional bounds parameter that restricts the points considered to those that fall within the bounds provided. The bounds parameter accepts the bounding box in all the same accepted formats of the bounds specified in the Geo Bounding Box Query. This bounding box can be used with or without an additional geo_bounding_box query filtering the points prior to aggregating. It is an independent bounding box that can intersect with, be equal to, or be disjoint to any additional geo_bounding_box queries defined in the context of the aggregation.

  • sizeint The maximum number of buckets to return (defaults to 10,000). When results are trimmed, buckets are prioritised based on the volumes of documents they contain.

  • shard_sizeOptional[int] To allow for more accurate counting of the top cells returned in the final result the aggregation defaults to returning max(10, (size x number-of-shards)) buckets from each shard. If this heuristic is undesirable, the number considered from each shard can be overridden using this parameter.

Returns

'AggregationInterface' A new instance is created and returned
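
Example (a sketch with a hypothetical "location" field and an arbitrary bounds restriction):

s = Search(index="world")

agg = s.agg_geotile_grid(
    field="location",    # hypothetical geo_point field
    precision=7,         # zoom level between 1 and 29
    bounds={
        "top_left": "52.6, 13.2",       # any accepted Geo Bounding Box format
        "bottom_right": "52.3, 13.6",
    },
)
agg.execute()

# keys follow the "{zoom}/{x}/{y}" format, e.g. "7/68/41"
list(agg.keys())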

agg_global(*aggregation_name: Optional[str])

Defines a single bucket of all the documents within the search execution context. This context is defined by the indices and the document types you’re searching on, but is not influenced by the search query itself.

Note

Global aggregators can only be placed as top level aggregators because it doesn’t make sense to embed a global aggregator within another bucket aggregator.

elasticsearch documentation

Parameters

aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

Returns

'AggregationInterface' A new instance is created and returned
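
Example (a sketch; "price" is a made-up field): metrics nested below the global bucket see every document in the index, regardless of the search query.

s = Search(index="world")

agg = s.agg_global("all-docs").metric_avg("avg-price", field="price")

s.execute()
list(agg.dict_rows())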

agg_histogram(*aggregation_name: Optional[str], field: str, interval: int, min_doc_count: int = 0, offset: Optional[int] = None, extended_bounds: Optional[Mapping[str, int]] = None, hard_bounds: Optional[Mapping[str, int]] = None, format: Optional[str] = None, order: Optional[Union[Mapping, str]] = None, keyed: bool = False, missing: Optional[Any] = None)

A multi-bucket values source based aggregation that can be applied on numeric values or numeric range values extracted from the documents. It dynamically builds fixed size (a.k.a. interval) buckets over the values. For example, if the documents have a field that holds a price (numeric), we can configure this aggregation to dynamically build buckets with interval 5 (in case of price it may represent $5). When the aggregation executes, the price field of every document will be evaluated and will be rounded down to its closest bucket - for example, if the price is 32 and the bucket size is 5 then the rounding will yield 30 and thus the document will “fall” into the bucket that is associated with the key 30. To make this more formal, here is the rounding function that is used:

bucket_key = Math.floor((value - offset) / interval) * interval + offset
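
The same rounding can be reproduced in plain Python as a quick sanity check:

import math

interval, offset, value = 5, 0, 32
bucket_key = math.floor((value - offset) / interval) * interval + offset
print(bucket_key)  # 30 - the document falls into the bucket with key 30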

For range values, a document can fall into multiple buckets. The first bucket is computed from the lower bound of the range in the same way as a bucket for a single value is computed. The final bucket is computed in the same way from the upper bound of the range, and the range is counted in all buckets in between and including those two.

The interval must be a positive decimal, while the offset must be a decimal in [0, interval) (a decimal greater than or equal to 0 and less than interval)

Histogram fields: Running a histogram aggregation over histogram fields computes the total number of counts for each interval. See example

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr A numeric field to be indexed by the histogram.

  • intervalint A positive decimal defining the interval between buckets.

  • min_doc_count

    int By default the response will fill gaps in the histogram with empty buckets. It is possible to change that and request buckets with a higher minimum count thanks to the min_doc_count setting.

    By default the histogram returns all the buckets within the range of the data itself, that is, the documents with the smallest values (on the field the histogram runs on) will determine the min bucket (the bucket with the smallest key) and the documents with the highest values will determine the max bucket (the bucket with the highest key). Often, when requesting empty buckets, this causes confusion, specifically when the data is also filtered.

    To understand why, let’s look at an example:

    Let’s say you’re filtering your request to get all docs with values between 0 and 500, and in addition you’d like to slice the data per price using a histogram with an interval of 50. You also specify “min_doc_count”: 0 as you’d like to get all buckets, even the empty ones. If it happens that all products (documents) have prices higher than 100, the first bucket you’ll get will be the one with 100 as its key. This is confusing, as many times you’d also like to get the buckets between 0 and 100.

  • offset

    Optional[int] By default the bucket keys start with 0 and then continue in even spaced steps of interval, e.g. if the interval is 10, the first three buckets (assuming there is data inside them) will be [0, 10), [10, 20), [20, 30). The bucket boundaries can be shifted by using the offset option.

    This can be best illustrated with an example. If there are 10 documents with values ranging from 5 to 14, using interval 10 will result in two buckets with 5 documents each. If an additional offset 5 is used, there will be only one single bucket [5, 15) containing all the 10 documents.

  • extended_bounds

    Optional[Mapping[str, int]] With extended_bounds setting, you now can “force” the histogram aggregation to start building buckets on a specific min value and also keep on building buckets up to a max value (even if there are no documents anymore). Using extended_bounds only makes sense when min_doc_count is 0 (the empty buckets will never be returned if min_doc_count is greater than 0).

    Note that (as the name suggests) extended_bounds is not filtering buckets. Meaning, if the extended_bounds.min is higher than the values extracted from the documents, the documents will still dictate what the first bucket will be (and the same goes for the extended_bounds.max and the last bucket). For filtering buckets, one should nest the histogram aggregation under a range filter aggregation with the appropriate from/to settings.

    When aggregating ranges, buckets are based on the values of the returned documents. This means the response may include buckets outside of a query’s range. For example, if your query looks for values greater than 100, and you have a range covering 50 to 150, and an interval of 50, that document will land in 3 buckets - 50, 100, and 150. In general, it’s best to think of the query and aggregation steps as independent - the query selects a set of documents, and then the aggregation buckets those documents without regard to how they were selected. See note on bucketing range fields for more information and an example.

  • hard_boundsOptional[Mapping[str, int]] The hard_bounds is a counterpart of extended_bounds and can limit the range of buckets in the histogram. It is particularly useful in the case of open data ranges that can result in a very large number of buckets.

  • formatOptional[str] Specifies the format of the ‘key_as_string’ response. See: mapping date format

  • orderOptional[Union[Mapping, str]] By default the returned buckets are sorted by their key ascending, though the order behaviour can be controlled using the order setting. Supports the same order functionality as the Terms Aggregation.

  • keyedbool Setting the keyed flag to true associates a unique string key with each bucket and returns the ranges as a hash rather than an array.

  • missingOptional[Any] The missing parameter defines how documents that are missing a value should be treated. By default they will be ignored but it is also possible to treat them as if they had a value.

Returns

'AggregationInterface' A new instance is created and returned
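
Example (a sketch over a hypothetical "price" field, forcing empty buckets from 0 to 500):

s = Search(index="world")

agg = s.agg_histogram(
    "prices",
    field="price",       # hypothetical numeric field
    interval=50,
    min_doc_count=0,
    extended_bounds={"min": 0, "max": 500},
)
agg.execute()

# keys are the bucket start values, values are the document counts
agg.to_dict()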

agg_ip_range(*aggregation_name: Optional[str], field: str, ranges: Sequence[Union[Mapping[str, str], str]], keyed: bool = False)

Just like the dedicated date range aggregation, there is also a dedicated range aggregation for IP typed fields.

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr The IPv4 field

  • ranges

    Sequence[Union[Mapping[str, str], str]] List of ranges defining the buckets, either as plain IPv4 addresses or as CIDR masks.

    Example:

    [
        {"to": "10.0.0.5"},
        {"from": "10.0.0.5", "to": "10.0.0.127"},
        {"from": "10.0.0.127"},
    ]
    

    Alternatively this parameter can be a list of strings. The above example can be rewritten as: ["10.0.0.5", "10.0.0.127"]

  • keyedbool Setting the keyed flag to true associates a unique string key with each bucket and returns the ranges as a hash rather than an array.

Returns

'AggregationInterface' A new instance is created and returned
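
Example (a sketch using the ranges from above; "client_ip" is a made-up field name):

s = Search(index="world")

agg = s.agg_ip_range(
    "ip-buckets",
    field="client_ip",   # hypothetical ip field
    ranges=[
        {"to": "10.0.0.5"},
        {"from": "10.0.0.5", "to": "10.0.0.127"},
        {"from": "10.0.0.127"},
    ],
)
agg.execute()
agg.to_dict()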

agg_missing(*aggregation_name: Optional[str], field: str)

A field data based single bucket aggregation, that creates a bucket of all documents in the current document set context that are missing a field value (effectively, missing a field or having the configured NULL value set). This aggregator will often be used in conjunction with other field data bucket aggregators (such as ranges) to return information for all the documents that could not be placed in any of the other buckets due to missing field data values.

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr The field we wish to investigate for missing values

Returns

'AggregationInterface' A new instance is created and returned
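
Example (a sketch that counts documents without a value in a hypothetical "price" field):

s = Search(index="world")

agg = s.agg_missing("no-price", field="price")
agg.execute()
agg.to_dict()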

agg_nested(*aggregation_name: Optional[str], path: str)

A special single bucket aggregation that enables aggregating nested documents.

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • pathstr The field of the nested document(s)

Returns

'AggregationInterface' A new instance is created and returned
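
Example (a sketch, assuming a nested mapping under a hypothetical "variants" path):

s = Search(index="world")

# descend into the nested documents, then bucket by one of their fields
agg = s \
    .agg_nested("variants", path="variants") \
    .agg_terms("colors", field="variants.color")

agg.execute()
agg.to_dict()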

agg_range(*aggregation_name: Optional[str], ranges: Sequence[Union[Mapping[str, Any], Any]], field: Optional[str] = None, keyed: bool = False, script: Optional[dict] = None)

A multi-bucket value source based aggregation that enables the user to define a set of ranges - each representing a bucket. During the aggregation process, the values extracted from each document will be checked against each bucket range and “bucket” the relevant/matching document.

Note

Note that this aggregation includes the from value and excludes the to value for each range.

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • ranges

    Sequence[Union[Mapping[str, Any], Any]] List of ranges to define the buckets

    Example:

    [
        {"to": 10},
        {"from": 10, "to": 20},
        {"from": 20},
    ]
    

    Alternatively this parameter can be a plain list of values. The above example can be rewritten as: [10, 20]

    Note

    This aggregation includes the from value and excludes the to value for each range.

  • fieldOptional[str] The field to index by the aggregation

  • keyedbool Setting the keyed flag to true associates a unique string key with each bucket and returns the ranges as a hash rather than an array.

  • scriptOptional[dict] Generating the terms using a script

Returns

'AggregationInterface' A new instance is created and returned
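
Example (a sketch using the ranges from above against a hypothetical "price" field):

s = Search(index="world")

agg = s.agg_range(
    "price-ranges",
    field="price",
    ranges=[
        {"to": 10},
        {"from": 10, "to": 20},
        {"from": 20},
    ],
)
agg.execute()
agg.to_dict()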

agg_rare_terms(*aggregation_name: Optional[str], field: str, max_doc_count: int = 1, include: Optional[Union[str, Sequence[str], Mapping[str, int]]] = None, exclude: Optional[Union[str, Sequence[str]]] = None, missing: Optional[Any] = None)

A multi-bucket value source based aggregation which finds “rare” terms — terms that are at the long-tail of the distribution and are not frequent. Conceptually, this is like a terms aggregation that is sorted by _count ascending. As noted in the terms aggregation docs, actually ordering a terms agg by count ascending has unbounded error. Instead, you should use the rare_terms aggregation.

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr The field we wish to find rare terms in

  • max_doc_count

    int The maximum number of documents a term should appear in.

    The max_doc_count parameter is used to control the upper bound of document counts that a term can have. There is not a size limitation on the rare_terms agg like terms agg has. This means that terms which match the max_doc_count criteria will be returned. The aggregation functions in this manner to avoid the order-by-ascending issues that afflict the terms aggregation.

    This does, however, mean that a large number of results can be returned if chosen incorrectly. To limit the danger of this setting, the maximum max_doc_count is 100.

  • include

    Optional[Union[str, Sequence[str], Mapping[str, int]]] A regexp pattern that filters the documents which will be aggregated.

    Alternatively can be a list of strings.

    Partition expressions are also possible.

  • exclude

    Optional[Union[str, Sequence[str]]] A regexp pattern that filters the documents which will be aggregated.

    Alternatively can be a list of strings.

  • missingOptional[Any] The missing parameter defines how documents that are missing a value should be treated. By default they will be ignored but it is also possible to treat them as if they had a value.

Returns

'AggregationInterface' A new instance is created and returned

agg_sampler(*aggregation_name: Optional[str], shard_size: int = 100)

A filtering aggregation used to limit any sub aggregations’ processing to a sample of the top-scoring documents.

Example use cases:

  • Tightening the focus of analytics to high-relevance matches rather than the potentially very long tail of low-quality matches

  • Reducing the running cost of aggregations that can produce useful results using only samples e.g. significant_terms

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • shard_sizeint The shard_size parameter limits how many top-scoring documents are collected in the sample processed on each shard. The default value is 100.

Returns

'AggregationInterface' A new instance is created and returned
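
Example (a sketch that restricts a significant_terms sub-aggregation to the 200 top-scoring documents per shard; "tags" is a made-up field):

s = Search(index="world")

agg = s \
    .agg_sampler("sample", shard_size=200) \
    .agg_significant_terms("keywords", field="tags")

agg.execute()
agg.to_dict()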

agg_significant_terms(*aggregation_name: Optional[str], field: str, size: int = 10, shard_size: Optional[int] = None, min_doc_count: int = 1, shard_min_doc_count: Optional[int] = None, execution_hint: str = 'global_ordinals', include: Optional[Union[str, Sequence[str], Mapping[str, int]]] = None, exclude: Optional[Union[str, Sequence[str]]] = None, script: Optional[dict] = None)

An aggregation that returns interesting or unusual occurrences of terms in a set.

Example use cases:

  • Suggesting “H5N1” when users search for “bird flu” in text

  • Identifying the merchant that is the “common point of compromise” from the transaction history of credit card owners reporting loss

  • Suggesting keywords relating to stock symbol $ATI for an automated news classifier

  • Spotting the fraudulent doctor who is diagnosing more than their fair share of whiplash injuries

  • Spotting the tire manufacturer who has a disproportionate number of blow-outs

In all these cases the terms being selected are not simply the most popular terms in a set. They are the terms that have undergone a significant change in popularity measured between a foreground and background set. If the term “H5N1” only exists in 5 documents in a 10 million document index and yet is found in 4 of the 100 documents that make up a user’s search results that is significant and probably very relevant to their search. 5/10,000,000 vs 4/100 is a big swing in frequency.

Warning

Picking a free-text field as the subject of a significant terms analysis can be expensive! It will attempt to load every unique word into RAM. It is recommended to only use this on smaller indices.

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr

  • sizeint The size parameter can be set to define how many term buckets should be returned out of the overall terms list. By default, the node coordinating the search process will request each shard to provide its own top size term buckets and once all shards respond, it will reduce the results to the final list that will then be returned to the client. This means that if the number of unique terms is greater than size, the returned list is slightly off and not accurate (it could be that the term counts are slightly off and it could even be that a term that should have been in the top size buckets was not returned).

  • shard_size

    Optional[int] The higher the requested size is, the more accurate the results will be, but also, the more expensive it will be to compute the final results (both due to bigger priority queues that are managed on a shard level and due to bigger data transfers between the nodes and the client).

    The shard_size parameter can be used to minimize the extra work that comes with bigger requested size. When defined, it will determine how many terms the coordinating node will request from each shard. Once all the shards responded, the coordinating node will then reduce them to a final result which will be based on the size parameter - this way, one can increase the accuracy of the returned terms and avoid the overhead of streaming a big list of buckets back to the client.

  • min_doc_count

    int It is possible to only return terms that match more than a configured number of hits using the min_doc_count option. Default value is 1.

    Terms are collected and ordered on a shard level and merged with the terms collected from other shards in a second step. However, the shard does not have the information about the global document count available. The decision if a term is added to a candidate list depends only on the order computed on the shard using local shard frequencies. The min_doc_count criterion is only applied after merging local terms statistics of all shards. In a way the decision to add the term as a candidate is made without being very certain about if the term will actually reach the required min_doc_count. This might cause many (globally) high frequent terms to be missing in the final result if low frequent terms populated the candidate lists. To avoid this, the shard_size parameter can be increased to allow more candidate terms on the shards. However, this increases memory consumption and network traffic.

  • shard_min_doc_count

    Optional[int] The parameter shard_min_doc_count regulates the certainty a shard has if the term should actually be added to the candidate list or not with respect to the min_doc_count. Terms will only be considered if their local shard frequency within the set is higher than the shard_min_doc_count. If your dictionary contains many low frequent terms and you are not interested in those (for example misspellings), then you can set the shard_min_doc_count parameter to filter out candidate terms on a shard level that will with a reasonable certainty not reach the required min_doc_count even after merging the local counts. shard_min_doc_count is set to 0 per default and has no effect unless you explicitly set it.

    Note

    Setting min_doc_count=0 will also return buckets for terms that didn’t match any hit. However, some of the returned terms which have a document count of zero might only belong to deleted documents or documents from other types, so there is no warranty that a match_all query would find a positive document count for those terms.

    Warning

    When NOT sorting on doc_count descending, high values of min_doc_count may return a number of buckets which is less than size because not enough data was gathered from the shards. Missing buckets can be brought back by increasing shard_size. Setting shard_min_doc_count too high will cause terms to be filtered out on a shard level. This value should be set much lower than min_doc_count/#shards.

  • execution_hint

    str There are different mechanisms by which terms aggregations can be executed:

    • by using field values directly in order to aggregate data per-bucket (map)

    • by using global ordinals of the field and allocating one bucket per global ordinal (global_ordinals)

    Elasticsearch tries to have sensible defaults so this is something that generally doesn’t need to be configured.

    global_ordinals is the default option for keyword fields. It uses global ordinals to allocate buckets dynamically, so memory usage is linear to the number of values of the documents that are part of the aggregation scope.

    map should only be considered when very few documents match a query. Otherwise the ordinals-based execution mode is significantly faster. By default, map is only used when running an aggregation on scripts, since they don’t have ordinals.

  • include

    Optional[Union[str, Sequence[str], Mapping[str, int]]] A regexp pattern that filters the documents which will be aggregated.

    Alternatively can be a list of strings.

    Partition expressions are also possible.

  • exclude

    Optional[Union[str, Sequence[str]]] A regexp pattern that filters the documents which will be aggregated.

    Alternatively can be a list of strings.

  • scriptOptional[dict] Generating the terms using a script

Returns

'AggregationInterface' A new instance is created and returned

agg_terms(*aggregation_name: Optional[str], field: str, size: int = 10, shard_size: Optional[int] = None, show_term_doc_count_error: Optional[bool] = None, order: Optional[Union[Mapping, str]] = None, min_doc_count: int = 1, shard_min_doc_count: Optional[int] = None, include: Optional[Union[str, Sequence[str], Mapping[str, int]]] = None, exclude: Optional[Union[str, Sequence[str]]] = None, missing: Optional[Any] = None, script: Optional[dict] = None)

A multi-bucket value source based aggregation where buckets are dynamically built - one per unique value.

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr

  • sizeint The size parameter can be set to define how many term buckets should be returned out of the overall terms list. By default, the node coordinating the search process will request each shard to provide its own top size term buckets and once all shards respond, it will reduce the results to the final list that will then be returned to the client. This means that if the number of unique terms is greater than size, the returned list is slightly off and not accurate (it could be that the term counts are slightly off and it could even be that a term that should have been in the top size buckets was not returned).

  • shard_size

    Optional[int] The higher the requested size is, the more accurate the results will be, but also, the more expensive it will be to compute the final results (both due to bigger priority queues that are managed on a shard level and due to bigger data transfers between the nodes and the client).

    The shard_size parameter can be used to minimize the extra work that comes with bigger requested size. When defined, it will determine how many terms the coordinating node will request from each shard. Once all the shards responded, the coordinating node will then reduce them to a final result which will be based on the size parameter - this way, one can increase the accuracy of the returned terms and avoid the overhead of streaming a big list of buckets back to the client.

  • show_term_doc_count_error

    Optional[bool] This shows an error value for each term returned by the aggregation which represents the worst case error in the document count and can be useful when deciding on a value for the shard_size parameter. This is calculated by summing the document counts for the last term returned by all shards which did not return the term.

    These errors can only be calculated in this way when the terms are ordered by descending document count. When the aggregation is ordered by the terms values themselves (either ascending or descending) there is no error in the document count since if a shard does not return a particular term which appears in the results from another shard, it must not have that term in its index. When the aggregation is either sorted by a sub aggregation or in order of ascending document count, the error in the document counts cannot be determined and is given a value of -1 to indicate this.

  • order

    Optional[Union[Mapping, str]] The order of the buckets can be customized by setting the order parameter. By default, the buckets are ordered by their doc_count descending.

    Warning

    Sorting by ascending _count or by sub aggregation is discouraged as it increases the error on document counts. It is fine when a single shard is queried, or when the field that is being aggregated was used as a routing key at index time: in these cases results will be accurate since shards have disjoint values. However otherwise, errors are unbounded. One particular case that could still be useful is sorting by min or max aggregation: counts will not be accurate but at least the top buckets will be correctly picked.

  • min_doc_count

    int It is possible to only return terms that match more than a configured number of hits using the min_doc_count option. Default value is 1.

    Terms are collected and ordered on a shard level and merged with the terms collected from other shards in a second step. However, the shard does not have the information about the global document count available. The decision if a term is added to a candidate list depends only on the order computed on the shard using local shard frequencies. The min_doc_count criterion is only applied after merging local terms statistics of all shards. In a way the decision to add the term as a candidate is made without being very certain about if the term will actually reach the required min_doc_count. This might cause many (globally) high frequent terms to be missing in the final result if low frequent terms populated the candidate lists. To avoid this, the shard_size parameter can be increased to allow more candidate terms on the shards. However, this increases memory consumption and network traffic.

  • shard_min_doc_count

    Optional[int] The parameter shard_min_doc_count regulates the certainty a shard has if the term should actually be added to the candidate list or not with respect to the min_doc_count. Terms will only be considered if their local shard frequency within the set is higher than the shard_min_doc_count. If your dictionary contains many low frequent terms and you are not interested in those (for example misspellings), then you can set the shard_min_doc_count parameter to filter out candidate terms on a shard level that will with a reasonable certainty not reach the required min_doc_count even after merging the local counts. shard_min_doc_count is set to 0 per default and has no effect unless you explicitly set it.

    Note

    Setting min_doc_count=0 will also return buckets for terms that didn’t match any hit. However, some of the returned terms which have a document count of zero might only belong to deleted documents or documents from other types, so there is no warranty that a match_all query would find a positive document count for those terms.

    Warning

    When NOT sorting on doc_count descending, high values of min_doc_count may return a number of buckets which is less than size because not enough data was gathered from the shards. Missing buckets can be brought back by increasing shard_size. Setting shard_min_doc_count too high will cause terms to be filtered out on a shard level. This value should be set much lower than min_doc_count/#shards.

  • include

    Optional[Union[str, Sequence[str], Mapping[str, int]]] A regexp pattern that filters the documents which will be aggregated.

    Alternatively can be a list of strings.

    Partition expressions are also possible.

  • exclude

    Optional[Union[str, Sequence[str]]] A regexp pattern that filters the documents which will be aggregated.

    Alternatively can be a list of strings.

  • missingOptional[Any] The missing parameter defines how documents that are missing a value should be treated. By default they will be ignored but it is also possible to treat them as if they had a value.

  • scriptOptional[dict] Generating the terms using a script

Returns

'AggregationInterface' A new instance is created and returned
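
Example (a sketch showing a few of the parameters; "country" is a made-up keyword field):

s = Search(index="world")

agg = s.agg_terms(
    "top-countries",
    field="country",
    size=5,
    order={"_count": "desc"},   # the default ordering, spelled out
    exclude=["atlantis"],       # drop specific terms from the result
)
agg.execute()
agg.to_dict()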

aggregation(*aggregation_name_type, **params) → elastipy.aggregation.aggregation.Aggregation[source]

Interface to create sub-aggregations.

This is the generic, undocumented version. Use the agg_*, metric_* and pipeline_* methods for convenience.

Parameters
  • aggregation_name_type – one or two strings, meaning either “type” or “name”, “type”

  • params – all parameters of the aggregation function

Returns

Aggregation instance

body_path() → str[source]

Return the dotted path of this aggregation in the request body

Returns

str

property buckets

Returns the buckets of the aggregation response

Only available for bucket root aggregations!

Returns

dict or list

df(index: Union[bool, str] = False, to_index: Union[bool, str] = False, include: Optional[Union[str, Sequence[str]]] = None, exclude: Optional[Union[str, Sequence[str]]] = None, flat: Union[bool, str, Sequence[str]] = False, dtype=None, default=None)

Converts the results of dict_rows() to a pandas DataFrame.

This will include all parent aggregations (up to the root) and all children aggregations (including metrics).

Any columns containing dates will be automatically converted to pandas.Timestamp.

This method has a synonym: df

Parameters
  • index

    bool or str Sets a specific column as the index of the DataFrame.

    • If False no explicit index is set.

    • If True the root aggregation’s keys will be the index.

    • if str explicitly set a certain column as the DataFrame index.

    Note

    The column is kept in the DataFrame. If you want to set a column as index and remove it from the columns, use to_index.

  • to_index

    bool or str Same as index but the column is removed from DataFrame.

    • If False no explicit index is set.

    • If True the root aggregation’s keys will be the index.

    • if str explicitly set a certain column as the DataFrame index.

  • includestr or list of str Can be one or more (OR-combined) wildcard patterns. If used, any column that does not fit a pattern is removed

  • excludestr or list of str Can be one or more (OR-combined) wildcard patterns. If used, any column that fits a pattern is removed

  • flat

    bool, str or sequence of str Can be one or more aggregation names that should be flattened out, meaning that each key of the aggregation creates a new column instead of a new row. If True, all bucket aggregations are flattened.

    Only supported for bucket aggregations!

    Note

    Currently not supported for the root aggregation!

  • dtype – Numpy data type to force. Only a single dtype is allowed. If None, infer.

  • default – This value will be used wherever a value is undefined.

Returns

pandas DataFrame instance
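
Example (a sketch; requires pandas to be installed, and "country" and "temperature" are made-up fields):

s = Search(index="world")

agg = s \
    .agg_terms("country", field="country") \
    .metric_avg("avg-temp", field="temperature")

agg.execute()

# one row per bucket; the metric becomes an extra column
df = agg.df(to_index="country", exclude="*doc_count")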

df_matrix(sort: Optional[Union[bool, str, int, Sequence[Union[str, int]]]] = None, default: Optional[Any] = None, include: Optional[Union[str, Sequence[str]]] = None, exclude: Optional[Union[str, Sequence[str]]] = None)

Returns a pandas DataFrame containing the matrix.

See to_matrix for details.

Only one- and two-dimensional matrices are supported.

Returns

pandas.DataFrame instance

Raises

ValueError – If dimensions is 0 or above 2

dict_rows(include: Optional[Union[str, Sequence[str]]] = None, exclude: Optional[Union[str, Sequence[str]]] = None, flat: Union[bool, str, Sequence[str]] = False) → Iterable[dict]

Iterates through all result values from this aggregation branch.

This will include all parent aggregations (up to the root) and all children aggregations (including metrics and pipelines).

Parameters
  • includestr or sequence of str Can be one or more (OR-combined) wildcard patterns. If used, any column that does not fit a pattern is removed.

  • excludestr or sequence of str Can be one or more (OR-combined) wildcard patterns. If used, any column that fits a pattern is removed.

  • flat

    bool, str or sequence of str Can be one or more aggregation names that should be flattened out, meaning that each key of the aggregation creates a new column instead of a new row. If True, all bucket aggregations are flattened.

    Only supported for bucket aggregations!

    Note

    Currently not supported for the root aggregation!

Returns

generator of dict

property dump

Access to printing interface

Returns

AggregationDump instance

execute()[source]

Executes the whole Search with all contained aggregations.

Returns

self

property group

Returns the name of the aggregation group.

Returns

str, either “bucket”, “metric” or “pipeline”

items(key_separator: Optional[str] = None, tuple_key: bool = False, default=None) → Iterable[Tuple]

Iterates through all key, value tuples.

Parameters
  • key_separatorstr Optional separator to concat multiple keys into one string.

  • tuple_key

    bool If True, the key is always a tuple.

    If False, the key is a string if there is only one key.

  • default – If not None any None-value will be replaced by this.

Returns

generator

key_name() → str[source]

Return default name of the bucket key field.

Metrics return their parent’s key

Returns

str

keys(key_separator: Optional[str] = None, tuple_key: bool = False)

Iterates through all keys of this aggregation.

For example, a top-level terms aggregation would return all bucketed field values.

For a nested bucket aggregation each key is a tuple of all parent keys as well.

Parameters
  • key_separatorstr Optional separator to concat multiple keys into one string

  • tuple_keybool If True, the key is always a tuple. If False, the key is a string if there is only one key.

Returns

generator
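
Example (a sketch with two nested bucket aggregations; "country" and "city" are made-up fields, the output is illustrative):

s = Search(index="world")

agg = s \
    .agg_terms("country", field="country") \
    .agg_terms("city", field="city")

agg.execute()

list(agg.keys())                   # e.g. [('de', 'berlin'), ('de', 'hamburg'), ...]
list(agg.keys(key_separator="/"))  # e.g. ['de/berlin', 'de/hamburg', ...]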

metric(*aggregation_name_type, **params)

Alias for aggregation()

metric_avg(*aggregation_name: Optional[str], field: str, missing: Optional[Any] = None, script: Optional[dict] = None, return_self: bool = False)

A single-value metrics aggregation that computes the average of numeric values that are extracted from the aggregated documents. These values can be extracted either from specific numeric fields in the documents, or be generated by a provided script.

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr

  • missingOptional[Any]

  • scriptOptional[dict]

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.
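
Example (a sketch of the default chaining behaviour; "country" and "price" are made-up fields):

s = Search(index="world")

agg = s.agg_terms("country", field="country")

# by default each metric_* call returns the parent, so metrics can be chained
agg.metric_avg("avg-price", field="price") \
   .metric_max("max-price", field="price")

s.execute()

# dict_rows() lists the metric values next to each bucket
list(agg.dict_rows())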

metric_boxplot(*aggregation_name: Optional[str], field: str, compression: int = 100, missing: Optional[Any] = None, return_self: bool = False)

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr

  • compressionint

  • missingOptional[Any]

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.

metric_cardinality(*aggregation_name: Optional[str], field: str, precision_threshold: int = 3000, missing: Optional[Any] = None, script: Optional[dict] = None, return_self: bool = False)

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr

  • precision_thresholdint

  • missingOptional[Any]

  • scriptOptional[dict]

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.

metric_extended_stats(*aggregation_name: Optional[str], field: str, sigma: float = 3.0, missing: Optional[Any] = None, script: Optional[dict] = None, return_self: bool = False)

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr

  • sigmafloat

  • missingOptional[Any]

  • scriptOptional[dict]

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.

metric_geo_bounds(*aggregation_name: Optional[str], field: str, wrap_longitude: bool = True, return_self: bool = False)

A metric aggregation that computes the bounding box containing all geo values for a field.

The Geo Bounds Aggregation is also supported on geo_shape fields.

If wrap_longitude is set to true (the default), the bounding box can overlap the international date line and return a bounds where the top_left longitude is larger than the top_right longitude.

For example, the upper right longitude will typically be greater than the lower left longitude of a geographic bounding box. However, when the area crosses the 180° meridian, the value of the lower left longitude will be greater than the value of the upper right longitude. See Geographic bounding box on the Open Geospatial Consortium website for more information.

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr The field defining the geo_point or geo_shape

  • wrap_longitudebool An optional parameter which specifies whether the bounding box should be allowed to overlap the international date line. The default value is true.

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.

metric_geo_centroid(*aggregation_name: Optional[str], field: str, return_self: bool = False)

A metric aggregation that computes the weighted centroid from all coordinate values for geo fields.

The centroid metric for geo-shapes is more nuanced than for points. The centroid of a specific aggregation bucket containing shapes is the centroid of the highest-dimensionality shape type in the bucket. For example, if a bucket contains shapes comprising polygons and lines, then the lines do not contribute to the centroid metric. Each type of shape’s centroid is calculated differently. Envelopes and circles ingested via the Circle ingest processor are treated as polygons.

Warning

Using geo_centroid as a sub-aggregation of geohash_grid:

The geohash_grid aggregation places documents, not individual geo-points, into buckets. If a document’s geo_point field contains multiple values, the document could be assigned to multiple buckets, even if one or more of its geo-points are outside the bucket boundaries.

If a geocentroid sub-aggregation is also used, each centroid is calculated using all geo-points in a bucket, including those outside the bucket boundaries. This can result in centroids outside of bucket boundaries.

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr The field defining the geo_point or geo_shape

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.

metric_matrix_stats(*aggregation_name: Optional[str], fields: list, mode: str = 'avg', missing: Optional[Any] = None, return_self: bool = False)

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldslist

  • modestr

  • missingOptional[Any]

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.

metric_max(*aggregation_name: Optional[str], field: str, missing: Optional[Any] = None, script: Optional[dict] = None, return_self: bool = False)

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr

  • missingOptional[Any]

  • scriptOptional[dict]

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.

metric_median_absolute_deviation(*aggregation_name: Optional[str], field: str, compression: int = 1000, missing: Optional[Any] = None, script: Optional[dict] = None, return_self: bool = False)

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr

  • compressionint

  • missingOptional[Any]

  • scriptOptional[dict]

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.

metric_min(*aggregation_name: Optional[str], field: str, missing: Optional[Any] = None, script: Optional[dict] = None, return_self: bool = False)

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr

  • missingOptional[Any]

  • scriptOptional[dict]

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.

metric_percentile_ranks(*aggregation_name: Optional[str], field: str, values: list, keyed: bool = True, hdr__number_of_significant_value_digits: Optional[int] = None, missing: Optional[Any] = None, script: Optional[dict] = None, return_self: bool = False)

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr

  • valueslist

  • keyedbool

  • hdr__number_of_significant_value_digitsOptional[int]

  • missingOptional[Any]

  • scriptOptional[dict]

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.

metric_percentiles(*aggregation_name: Optional[str], field: str, percents: list = '(1, 5, 25, 50, 75, 95, 99)', keyed: bool = True, tdigest__compression: int = 100, hdr__number_of_significant_value_digits: Optional[int] = None, missing: Optional[Any] = None, script: Optional[dict] = None, return_self: bool = False)

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr

  • percentslist

  • keyedbool

  • tdigest__compressionint

  • hdr__number_of_significant_value_digitsOptional[int]

  • missingOptional[Any]

  • scriptOptional[dict]

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.

metric_rate(*aggregation_name: Optional[str], unit: str, field: Optional[str] = None, script: Optional[dict] = None, return_self: bool = False)

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • unitstr

  • fieldOptional[str]

  • scriptOptional[dict]

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.

metric_scripted_metric(*aggregation_name: Optional[str], map_script: str, combine_script: str, reduce_script: str, init_script: Optional[str] = None, params: Optional[dict] = None, return_self: bool = False)

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • map_scriptstr

  • combine_scriptstr

  • reduce_scriptstr

  • init_scriptOptional[str]

  • paramsOptional[dict]

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.

metric_stats(*aggregation_name: Optional[str], field: str, missing: Optional[Any] = None, return_self: bool = False)

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr

  • missingOptional[Any]

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.

metric_string_stats(*aggregation_name: Optional[str], field: str, show_distribution: bool = False, missing: Optional[Any] = None, return_self: bool = False)

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr

  • show_distributionbool

  • missingOptional[Any]

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.

metric_sum(*aggregation_name: Optional[str], field: str, missing: Optional[Any] = None, script: Optional[dict] = None, return_self: bool = False)

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldstr

  • missingOptional[Any]

  • scriptOptional[dict]

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.

metric_t_test(*aggregation_name: Optional[str], a__field: str, b__field: str, type: str, a__filter: Optional[dict] = None, b__filter: Optional[dict] = None, script: Optional[dict] = None, return_self: bool = False)

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • a__fieldstr

  • b__fieldstr

  • typestr

  • a__filterOptional[dict]

  • b__filterOptional[dict]

  • scriptOptional[dict]

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.

metric_top_hits(*aggregation_name: Optional[str], size: int, sort: Optional[dict] = None, _source: Optional[dict] = None, return_self: bool = False)

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • sizeint

  • sortOptional[dict]

  • _sourceOptional[dict]

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.

metric_top_metrics(*aggregation_name: Optional[str], metrics: dict, sort: Optional[dict] = None, return_self: bool = False)

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • metricsdict

  • sortOptional[dict]

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.

metric_value_count(*aggregation_name: Optional[str], field: Optional[str] = None, script: Optional[dict] = None, return_self: bool = False)

A single-value metrics aggregation that counts the number of values that are extracted from the aggregated documents. These values can be extracted either from specific fields in the documents, or be generated by a provided script. Typically, this aggregator will be used in conjunction with other single-value aggregations. For example, when computing the avg one might be interested in the number of values the average is computed over.

value_count does not de-duplicate values, so even if a field has duplicates (or a script generates multiple identical values for a single document), each value will be counted individually.

Note

Because value_count is designed to work with any field it internally treats all values as simple bytes. Due to this implementation, if the _value script variable is used to fetch a value instead of accessing the field directly (e.g. a “value script”), the field value will be returned as a string instead of its native format.

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • fieldOptional[str] The field whose values should be counted

  • scriptOptional[dict] Alternatively counting the values generated by a script

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.

metric_weighted_avg(*aggregation_name: Optional[str], value__field: str, weight__field: str, value__missing: Optional[Any] = None, weight__missing: Optional[Any] = None, format: Optional[str] = None, value_type: Optional[str] = None, script: Optional[dict] = None, return_self: bool = False)

A single-value metrics aggregation that computes the weighted average of numeric values that are extracted from the aggregated documents. These values can be extracted from specific numeric fields in the documents.

When calculating a regular average, each datapoint has an equal “weight” … it contributes equally to the final value. Weighted averages, on the other hand, weight each datapoint differently. The amount that each datapoint contributes to the final value is extracted from the document, or provided by a script.

As a formula, a weighted average is the ∑(value * weight) / ∑(weight)

A regular average can be thought of as a weighted average where every value has an implicit weight of 1

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • value__fieldstr The field that values should be extracted from

  • weight__fieldstr The field that weights should be extracted from

  • value__missingOptional[Any] A value to use if the field is missing entirely

  • weight__missingOptional[Any] A weight to use if the field is missing entirely

  • formatOptional[str]

  • value_typeOptional[str]

  • scriptOptional[dict]

  • return_selfbool If True, this call returns the created metric, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.
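
Example (a sketch, assuming the metric can be attached directly to the Search; "grade" and "credit_points" are made-up fields):

s = Search(index="world")

agg = s.metric_weighted_avg(
    "weighted-grade",
    value__field="grade",
    weight__field="credit_points",
    return_self=True,
)

s.execute()
agg.to_dict()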

metrics()[source]

Iterate through all contained metric aggregations

Returns

generator of Aggregation

pipeline(*aggregation_name_type, **params)

Alias for aggregation()

pipeline_avg_bucket(*aggregation_name: Optional[str], buckets_path: str, gap_policy: str = 'skip', format: Optional[str] = None, return_self: bool = False)

A sibling pipeline aggregation which calculates the (mean) average value of a specified metric in a sibling aggregation. The specified metric must be numeric and the sibling aggregation must be a multi-bucket aggregation.

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • buckets_path

    str The path to the buckets we wish to find the average for.

    See: bucket path syntax

  • gap_policy

    str The policy to apply when gaps are found in the data.

    See: gap policy

  • formatOptional[str] Format to apply to the output value of this aggregation

  • return_selfbool If True, this call returns the created pipeline, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.
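
A sketch of a sibling pipeline, assuming the pipeline_* methods are available on the Search object in the same way as the agg_* and metric_* methods, with field names borrowed from the git-commit example further below:

from elastipy import Search

s = Search(index="elastipy-example-commits-pandas")
agg = s.agg_date_histogram("date", calendar_interval="week")
agg.metric_sum("add", field="changes.additions")

# sibling pipeline at the root level, averaging the weekly sums
s.pipeline_avg_bucket("avg-weekly-add", buckets_path="date>add")
s.execute()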

pipeline_bucket_script(*aggregation_name: Optional[str], script: str, buckets_path: Mapping[str, str], gap_policy: str = 'skip', format: Optional[str] = None, return_self: bool = False)

A parent pipeline aggregation which executes a script which can perform per bucket computations on specified metrics in the parent multi-bucket aggregation. The specified metric must be numeric and the script must return a numeric value.

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • scriptstr The script to run for this aggregation. The script can be inline, file or indexed. (see Scripting for more details)

  • buckets_pathMapping[str, str] A map of script variables and their associated path to the buckets we wish to use for the variable (see buckets_path Syntax for more details)

  • gap_policystr The policy to apply when gaps are found in the data (see Dealing with gaps in the data for more details)

  • formatOptional[str] Format to apply to the output value of this aggregation

  • return_selfbool If True, this call returns the created pipeline, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.
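
A sketch that nests the bucket_script inside a date histogram; the painless script and the field names are illustrative:

from elastipy import Search

s = Search(index="elastipy-example-commits-pandas")
agg = s.agg_date_histogram("date", calendar_interval="week")
agg.metric_sum("add", field="changes.additions")
agg.metric_sum("del", field="changes.deletions")

# computed once per bucket of the parent date histogram
agg.pipeline_bucket_script(
    "add-del-ratio",
    script="params.add / (params.del + 1)",
    buckets_path={"add": "add", "del": "del"},
)
s.execute()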

pipeline_derivative(*aggregation_name: Optional[str], buckets_path: str, gap_policy: str = 'skip', format: Optional[str] = None, units: Optional[str] = None, return_self: bool = False)

A parent pipeline aggregation which calculates the derivative of a specified metric in a parent histogram (or date_histogram) aggregation. The specified metric must be numeric and the enclosing histogram must have min_doc_count set to 0 (default for histogram aggregations).

elasticsearch documentation

Parameters
  • aggregation_nameOptional[str] Optional name of the aggregation. Otherwise it will be auto-generated.

  • buckets_path

    str The path to the buckets we wish to calculate the derivative for.

    See: bucket path syntax

  • gap_policy

    str The policy to apply when gaps are found in the data.

    See: gap policy

  • formatOptional[str] Format to apply to the output value of this aggregation

  • unitsOptional[str] The derivative aggregation allows the units of the derivative values to be specified. This returns an extra field in the response normalized_value which reports the derivative value in the desired x-axis units.

  • return_selfbool If True, this call returns the created pipeline, otherwise the parent is returned.

Returns

'AggregationInterface' A new instance is created and attached to the parent and the parent is returned, unless ‘return_self’ is True, in which case the new instance is returned.
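
A sketch that nests the derivative inside a date histogram; the index and field names are borrowed from the git-commit example further below:

from elastipy import Search

s = Search(index="elastipy-example-commits-pandas")
agg = s.agg_date_histogram("date", calendar_interval="week")
agg.metric_sum("add", field="changes.additions")

# per-bucket difference of the weekly sum of additions
agg.pipeline_derivative("add-change", buckets_path="add")
s.execute()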

pipelines()[source]

Iterate through all contained pipeline aggregations

Returns

generator of Aggregation

property plot

Access to pandas plotting interface.

Returns

PandasPlotWrapper instance

property response

Returns the response object of the aggregation

Only available for root aggregations!

Returns

dict

rows(header: bool = True, include: Optional[Union[str, Sequence[str]]] = None, exclude: Optional[Union[str, Sequence[str]]] = None, flat: Union[bool, str, Sequence[str]] = False, default=None)Iterable[list]

Iterates through all result values from this aggregation branch.

Each row is a list. The first row contains the names if ‘header’ == True.

This will include all parent aggregations (up to the root) and all children aggregations (including metrics).

Parameters
  • headerbool If True, the first row contains the names of the columns

  • includestr or sequence of str Can be one or more (OR-combined) wildcard patterns. If used, any column that does not fit a pattern is removed.

  • excludestr or sequence of str Can be one or more (OR-combined) wildcard patterns. If used, any column that fits a pattern is removed.

  • flat

    bool, str or sequence of str Can be one or more aggregation names that should be flattened out, meaning that each key of the aggregation creates a new column instead of a new row. If True, all bucket aggregations are flattened.

    Only supported for bucket aggregations!

    Note

    Currently not supported for the root aggregation!

  • default – This value will be used wherever a value is undefined.

Returns

generator of list
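
For example (index and field names are illustrative):

from elastipy import Search

agg = Search(index="elastipy-example-commits-pandas") \
    .agg_terms("author", field="author") \
    .metric_avg("avg-add", field="changes.additions") \
    .execute()

list(agg.rows(exclude="*doc_count"))
# [['author', 'avg-add'],
#  ['author-a', 120.5],
#  ['author-b', 33.0]]  (illustrative output)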

to_body()[source]

Returns the part of the elasticsearch request body

Returns

dict

to_dict(key_separator=None, default=None)dict

Create a dictionary from all key/value pairs.

Parameters
  • key_separator – str, optional separator to concat multiple keys into one string

  • default – If not None any None-value will be replaced by this.

Returns

dict
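
With nested bucket aggregations the keys are tuples; key_separator joins them into single strings. A sketch with illustrative index and field names:

from elastipy import Search

agg = Search(index="elastipy-example-commits-pandas") \
    .agg_terms("author", field="author") \
    .agg_terms("file", field="changes.file") \
    .execute()

agg.to_dict(key_separator="|")
# {'author-a|setup.py': 3, 'author-a|README.rst': 1, ...}  (illustrative output)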

to_matrix(sort: Optional[Union[bool, str, int, Sequence[Union[str, int]]]] = None, default: Optional[Any] = None, include: Optional[Union[str, Sequence[str]]] = None, exclude: Optional[Union[str, Sequence[str]]] = None)Tuple[List[str], List, List]

Generate an N-dimensional matrix from the values of this aggregation.

Each dimension corresponds to one of the parent bucket keys that lead to this aggregation.

The values are gathered through the Aggregation.items method. So the matrix values are either the doc_count of the bucket aggregation or the result of a metric or pipeline aggregation that is inside one of the bucket aggregations.

a = Search().agg_terms("color", field="color")
a = a.agg_terms("shape", field="shape")
...
names, keys, matrix = a.to_matrix()
names == ["color", "shape"]
keys == [["red", "green", "blue"], ["circle", "triangle"]]
matrix == [[23, 42], [84, 69], [4, 10]]
Parameters
  • sort

    Can sort one or several keys/axes.

    • True sorts all keys ascending

    • "-" sorts all keys descending

    • The name of an aggregation sorts its keys ascending. A "-" prefix sorts descending.

    • An integer defines the aggregation by index. Negative integers sort descending.

    • A sequence of strings or integers can sort multiple keys

    For example, agg.to_matrix(sort=("color", "-shape", -4)) would sort the color keys ascending, the shape keys descending and the 4th aggregation (whatever that is) descending.

  • default – If not None any None-value will be replaced by this value

  • includestr | seq[str] One or more wildcard patterns that include matching keys. All other keys are removed from the output.

  • excludestr | seq[str] One or more wildcard patterns that exclude matching keys.

Returns

A tuple of names, keys and matrix data, each as list.

The names are the names of each aggregation that generates keys.

The keys are a list of lists, each corresponding to all the keys of each parent aggregation.

Data is a list, with other nested lists for each further dimension, containing the values of this aggregation.

Returns three empty lists if no data is available.

to_pandas(index: Union[bool, str] = False, to_index: Union[bool, str] = False, include: Optional[Union[str, Sequence[str]]] = None, exclude: Optional[Union[str, Sequence[str]]] = None, flat: Union[bool, str, Sequence[str]] = False, dtype=None, default=None)

Converts the results of dict_rows() to a pandas DataFrame.

This will include all parent aggregations (up to the root) and all children aggregations (including metrics).

Any columns containing dates will be automatically converted to pandas.Timestamp.

This method has a synonym: df

Parameters
  • index

    bool or str Sets a specific column as the index of the DataFrame.

    • If False no explicit index is set.

    • If True the root aggregation’s keys will be the index.

    • if str explicitly set a certain column as the DataFrame index.

    Note

    The column is kept in the DataFrame. If you want to set a column as index and remove it from the columns, use to_index.

  • to_index

    bool or str Same as index but the column is removed from DataFrame.

    • If False no explicit index is set.

    • If True the root aggregation’s keys will be the index.

    • if str explicitly set a certain column as the DataFrame index.

  • includestr or list of str Can be one or more (OR-combined) wildcard patterns. If used, any column that does not fit a pattern is removed

  • excludestr or list of str Can be one or more (OR-combined) wildcard patterns. If used, any column that fits a pattern is removed

  • flat

    bool, str or sequence of str Can be one or more aggregation names that should be flattened out, meaning that each key of the aggregation creates a new column instead of a new row. If True, all bucket aggregations are flattened.

    Only supported for bucket aggregations!

    Note

    Currently not supported for the root aggregation!

  • dtype – Numpy data type to force. Only a single dtype is allowed. If None, infer.

  • default – This value will be used wherever a value is undefined.

Returns

pandas DataFrame instance
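
For example (index and field names are illustrative):

from elastipy import Search

s = Search(index="elastipy-example-commits-pandas")
agg = s.agg_date_histogram("date", calendar_interval="week")
agg.metric_sum("add", field="changes.additions")

# 'date' becomes the DataFrame index and is removed from the columns
df = agg.execute().to_pandas(to_index="date", exclude="*doc_count")
df.head()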

values(default=None)

Iterates through all values of this aggregation.

Parameters

default – If not None any None-value will be replaced by this.

Returns

generator

printing utilities

class elastipy.aggregation.aggregation_dump.AggregationDump(agg: elastipy.aggregation.aggregation.Aggregation)[source]

Bases: object

dict(key_separator: str = '|', default: Optional[Any] = None, indent: int = 2, file: Optional[TextIO] = None)[source]

Print the result of Aggregation.to_dict to console.

Parameters
  • key_separatorstr Separator to concat multiple keys into one string. Defaults to |

  • default – If not None any None-value will be replaced by this.

  • indent – The json indentation, defaults to 2.

  • file – Optional output stream.

hbar(width: Optional[int] = None, zero_based: bool = True, digits: int = 3, ascii: bool = False, colors: bool = True, file: Optional[TextIO] = None)[source]

Print a horizontal bar graphic based on Aggregation.keys() and values() to console.

Parameters
  • widthint Maximum width to use. Will be auto-detected if None.

  • zero_basedbool If True, bars start at zero instead of the global minimum

  • digitsint Optional number of digits for rounding.

  • colorsbool Enable console colors.

  • asciibool If True fall back to ascii characters.

  • file – Optional text stream to print to.
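
For example (illustrative index and field):

from elastipy import Search

agg = Search(index="elastipy-example-commits-pandas") \
    .agg_terms("author", field="author", size=5) \
    .execute()

agg.dump.hbar(colors=False)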

heatmap(sort: Optional[Union[bool, str, int, Sequence[Union[str, int]]]] = None, default: Optional[Any] = None, include: Optional[Union[str, Sequence[str]]] = None, exclude: Optional[Union[str, Sequence[str]]] = None, colors: bool = True, ascii: bool = False, **kwargs)[source]

Prints a heat-map from a two-dimensional matrix.

Parameters
  • sort

    Can sort one or several keys/axes.

    • True sorts all keys ascending

    • "-" sorts all keys descending

    • The name of an aggregation sorts its keys ascending. A "-" prefix sorts descending.

    • An integer defines the aggregation by index. Negative integers sort descending.

    • A sequence of strings or integers can sort multiple keys

    For example, agg.heatmap(sort=("color", "-shape", -4)) would sort the color keys ascending, the shape keys descending and the 4th aggregation (whatever that is) descending.

  • default – If not None any None-value will be replaced by this value

  • includestr | seq[str] One or more wildcard patterns that include matching keys. All other keys are removed from the output.

  • excludestr | seq[str] One or more wildcard patterns that exclude matching keys.

  • colorsbool Enable console colors.

  • asciibool If True fall back to ascii characters.

  • max_widthint Will limit the expansion of the table when bars are enabled. If left None, the terminal width is used.

  • file – Optional text stream to print to.

  • kwargs – TODO list all Heatmap parameters
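
For example, a two-dimensional terms/date_histogram aggregation can be dumped like this (illustrative index and fields):

from elastipy import Search

agg = Search(index="elastipy-example-commits-pandas") \
    .agg_terms("author", field="author", size=5) \
    .agg_date_histogram("date", calendar_interval="year") \
    .execute()

agg.dump.heatmap(sort=True, colors=False)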

matrix(indent: int = 2, file: Optional[TextIO] = None, **kwargs)[source]

Print a representation of Aggregation.to_matrix() to console.

Parameters
  • indent – The json indentation, defaults to 2.

  • file – Optional output stream.

  • kwargs – TODO: list additional to_matrix parameters

table(include: Optional[Union[str, Sequence[str]]] = None, exclude: Optional[Union[str, Sequence[str]]] = None, flat: Union[bool, str, Sequence[str]] = False, sort: Optional[str] = None, digits: Optional[int] = None, header: bool = True, bars: bool = True, zero: Union[bool, float] = True, colors: bool = True, ascii: bool = False, max_width: Optional[int] = None, max_bar_width: int = 40, file: Optional[TextIO] = None)[source]

Print the result of the Aggregation.dict_rows() function as table to console.

Parameters
  • includestr or sequence of str Can be one or more (OR-combined) wildcard patterns. If used, any column that does not fit a pattern is removed.

  • excludestr or sequence of str Can be one or more (OR-combined) wildcard patterns. If used, any column that fits a pattern is removed.

  • flat

    bool, str or sequence of str Can be one or more aggregation names that should be flattened out, meaning that each key of the aggregation creates a new column instead of a new row. If True, all bucket aggregations are flattened.

    Only supported for bucket aggregations!

    Note

    Currently not supported for the root aggregation!

  • sortstr Optional sort column name which must match a ‘header’ key. Can be prefixed with - (minus) to reverse order

  • digitsint Optional number of digits for rounding.

  • headerbool if True, include the names in the first row.

  • barsbool Enable display of horizontal bars in each number column. The table width will stretch to accommodate the bars, limited by ‘max_width’ and ‘max_bar_width’

  • zero

    • If True: the bar axis starts at zero (or at a negative value if appropriate).

    • If False: the bar starts at the minimum of all values in the column.

    • If a number is provided, the bar starts there, regardless of the minimum of all values.

  • colorsbool Enable console colors.

  • asciibool If True fall back to ascii characters.

  • max_widthint Will limit the expansion of the table when bars are enabled. If left None, the terminal width is used.

  • max_bar_widthint The maximum size a bar should have

  • file – Optional text stream to print to.

plotting

class elastipy.plot.aggregation_plot_pd.PandasPlotWrapper(agg: elastipy.aggregation.aggregation.Aggregation)[source]

Bases: object

This is a short-hand accessor to the pandas.DataFrame.plot interface.

The documented parameters below will be passed to Aggregation.to_pandas. All other parameters are passed to the respective functions of the pandas interface.

s = Search()
s.agg_terms("idx", field="a").execute().plot(
    to_index="idx",
    kind="bar",
)
Parameters
  • index

    bool or str Sets a specific column as the index of the DataFrame.

    • If False no explicit index is set.

    • If True the root aggregation’s keys will be the index.

    • if str explicitly set a certain column as the DataFrame index.

    Note

    The column is kept in the DataFrame. If you want to set a column as index and remove it from the columns, use to_index.

  • to_index

    bool or str Same as index but the column is removed from DataFrame.

    • If False no explicit index is set.

    • If True the root aggregation’s keys will be the index.

    • if str explicitly set a certain column as the DataFrame index.

  • includestr or list of str Can be one or more (OR-combined) wildcard patterns. If used, any column that does not fit a pattern is removed

  • excludestr or list of str Can be one or more (OR-combined) wildcard patterns. If used, any column that fits a pattern is removed

  • flat

    bool, str or sequence of str Can be one or more aggregation names that should be flattened out, meaning that each key of the aggregation creates a new column instead of a new row. If True, all bucket aggregations are flattened.

    Only supported for bucket aggregations!

    Note

    Currently not supported for the root aggregation!

  • dtype – Numpy data type to force. Only a single dtype is allowed. If None, infer.

  • default – This value will be used wherever a value is undefined.

Returns

matplotlib.axes.Axes or numpy.ndarray of them If the backend is not the default matplotlib one, the return value will be the object returned by the backend.

area(x=None, y=None, **kwargs)[source]

Draw a stacked area plot.

See pandas.DataFrame.plot.area

bar(x=None, y=None, **kwargs)[source]

Vertical bar plot.

See pandas.DataFrame.plot.bar

barh(x=None, y=None, **kwargs)[source]

Horizontal bar plot.

See pandas.DataFrame.plot.barh

box(by=None, **kwargs)[source]

Make a box plot of the DataFrame columns.

See pandas.DataFrame.plot.box

heatmap(sort: Optional[Union[bool, str, int, Sequence[Union[str, int]]]] = None, default: Optional[Any] = None, replace=None, include: Optional[Union[str, Sequence[str]]] = None, exclude: Optional[Union[str, Sequence[str]]] = None, transpose: bool = False, figsize: Optional[Tuple[Union[int, float], Union[int, float]]] = None, **kwargs)[source]

Plots a heatmap using the data from Aggregation.df_matrix.

Pandas’ default plotting backend is matplotlib. In this case the seaborn.heatmap function is used and the seaborn package must be installed along with pandas and matplotlib.

The plotly backend is also supported, in which case the plotly.express.imshow function (https://plotly.com/python/imshow/) is used.

In matplotlib-mode, the figsize parameter will create a new Axes before calling seaborn.heatmap. For plotly it is ignored.

The documented parameters below are passed to Aggregation.df_matrix, generating a pandas.DataFrame. All other parameters are passed to the heatmap function.

Labels can be defined in plotly with the labels parameter, e.g. labels={"x": "date", "y": "temperature", "color": "date.doc_count"}. If labels or any of the keys are not defined they will be set to the name of each aggregation. color will either be <bucket-agg-name>.doc_count or <metric-name> (or pipeline).

Parameters
  • sort

    Can sort one or several keys/axes.

    • True sorts all keys ascending

    • "-" sorts all keys descending

    • The name of an aggregation sorts its keys ascending. A "-" prefix sorts descending.

    • An integer defines the aggregation by index. Negative integers sort descending.

    • A sequence of strings or integers can sort multiple keys

    For example, agg.to_matrix(sort=("color", "-shape", -4)) would sort the color keys ascending, the shape keys descending and the 4th aggregation (whatever that is) descending.

  • default – If not None any None-value will be replaced by this value

  • includestr | seq[str] One or more wildcard patterns that include matching keys. All other keys are removed from the output.

  • excludestr | seq[str] One or more wildcard patterns that exclude matching keys.

  • replace

    str, regex, list, dict, Series, int, float, or None

    If not None, the pandas.DataFrame.replace function will be called with this parameter as the to_replace parameter.

  • transposebool Transposes the matrix, e.g. exchanges X and Y axis.

  • figsizetuple of ints or floats Optional tuple to change the size of the plot when the plotting backend is matplotlib. int values will be passed to matplotlib.axes.Axes unchanged. A float value defines the size in terms of the number of keys per axis and is converted to int with int(len(keys) * value)

  • kwargs – Passed to seaborn.heatmap()

Returns

matplotlib.axes.Axes Axis object with the heatmap.

hexbin(x, y, C=None, reduce_C_function=None, gridsize=None, **kwargs)[source]

Generate a hexagonal binning plot.

See pandas.DataFrame.plot.hexbin

hist(by=None, bins=10, **kwargs)[source]

Draw one histogram of the DataFrame’s columns.

See pandas.DataFrame.plot.hist

kde(bw_method=None, ind=None, **kwargs)[source]

Generate Kernel Density Estimate plot using Gaussian kernels.

See pandas.DataFrame.plot.kde

line(x=None, y=None, **kwargs)[source]

Plot Series or DataFrame as lines.

See pandas.DataFrame.plot.line

pie(**kwargs)[source]

Generate a pie plot.

See pandas.DataFrame.plot.pie

scatter(x, y, s=None, c=None, **kwargs)[source]

Create a scatter plot with varying marker point size and color.

See pandas.DataFrame.plot.scatter

Exporter

A base class to help export documents to elasticsearch.

reference

class elastipy.Exporter(client=None, index_prefix: Optional[str] = None, index_postfix: Optional[str] = None, update_index: bool = True)[source]

Bases: object

Base class helper to export stuff to elasticsearch.

Derive from class and define class attributes:

  • INDEX_NAME: str Name of index, might contain a wildcard *

  • MAPPINGS: dict The mapping definition for the index.

And optionally override methods:

property client

Access to the elasticsearch client. If none was defined in constructor then elastipy.connections.get("default") is returned.

delete_index()bool[source]

Try to delete the index. Ignore if not found.

Returns

bool True if deleted, False otherwise.

If the index name contains a wildcard *, True is always returned.

export_list(object_list: Iterable[Any], chunk_size: int = 500, refresh: bool = False, verbose: bool = False, verbose_total: Optional[int] = None, file=None, **kwargs)[source]

Export a list of objects.

Parameters
  • object_listsequence of dict This can be a list or generator of dictionaries, containing the objects that should be exported.

  • chunk_sizeint Number of objects per bulk request.

  • refreshbool if True require the immediate refresh of the index when finished exporting.

  • verbosebool If True print some progress to stderr (using tqdm if present)

  • verbose_totalint Provide the number of objects for the verbosity if object_list is a generator.

  • file – Optional string stream to output verbose info, default is stderr.

All other parameters are passed to elasticsearch.helpers.bulk

Returns

dict Response of elasticsearch bulk call.

get_document_id(es_data: Mapping)[source]

Override this to return a single elasticsearch object’s id.

Parameters

es_datadict Single object as returned by transform_document()

Returns

str, int etc..

get_document_index(es_data: Mapping)str[source]

Override to define an index per document.

The default function returns the result from index_name() but it’s possible to put objects into separate indices.

For example you might define INDEX_NAME = "documents-*"

and get_document_index might return

self.index_name().replace("*", es_data["type"])
Parameters

es_datadict Single document as returned by transform_document()

Returns

str

get_index_params()dict[source]

Returns the complete index parameters.

Override if you need to specialize things.

Returns

dict

index_name()str[source]

Returns the configured index_prefix - INDEX_NAME - index_postfix

Returns

str

search(**kwargs)elastipy.search.Search[source]

Return a new Search object for this index and client.

Returns

Search instance

transform_document(data: Mapping)Union[Mapping, Iterator[Mapping]][source]

Override this to transform each documents’s data into an elasticsearch document.

It’s possible to return a list or yield multiple elasticsearch documents.

Parameters

data – dict

Returns

dict or iterable of dict

update_index()None[source]

Create the index or update changes to the mapping.

Can only be called if INDEX_NAME does not contain a '*'.

Returns

None
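
A minimal sketch of a derived exporter. The index name, the mapping and the use of a commit hash as the document id are assumptions for illustration:

from elastipy import Exporter

class CommitExporter(Exporter):

    INDEX_NAME = "elastipy-example-commits-demo"

    MAPPINGS = {
        "properties": {
            "timestamp": {"type": "date"},
            "hash": {"type": "keyword"},
            "message": {"type": "text"},
        }
    }

    def get_document_id(self, es_data):
        # use the commit hash as the elasticsearch document id
        return es_data["hash"]

exporter = CommitExporter()
exporter.export_list([
    {"timestamp": "2021-01-01T00:00:00Z", "hash": "abc123", "message": "initial commit"},
])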

import numpy as np
from elastipy import Search, query

They always say: put the imports at the top!

git commit analytics

Below we use a lot of pandas and plotting to get insight into the community of an open source project.

To explore a repository of your choice move to elastipy/examples/ and call:

python gitlogs.py <project-name> path/to/git-repo

If you are cloning a repository and are just interested in commits you can somewhat limit the size on disk with:

git clone <repo-url> --no-checkout

Replace the <project-name> with the name of the project and change the value below in the notebook:

PROJECT = "pandas"

def search():
    return Search(f"elastipy-example-commits-{PROJECT}")

activity

commits per week

s = search()
agg = s.agg_date_histogram("date", calendar_interval="week")
df = agg.execute().df(to_index=True)
df["commits/week"] = df.pop("date.doc_count")
df["smooth"] = df.rolling(window=50).mean()
df.plot(figsize=(15,4), color=["lightblue", "blue"])
_images/gitlogs_7_1.png

additions/deletions per week

s = search()
agg = s.agg_date_histogram("date", calendar_interval="month")
agg.metric_sum("add", field="changes.additions")
agg.metric_sum("del", field="changes.deletions")
df = agg.execute().df(to_index=True, exclude="*doc_count")
#df = df.rolling(window=10).mean()[["add", "del"]]
df.plot.line(color=["green", "pink"], figsize=(15,4))
_images/gitlogs_9_1.png

commits per weekday/hour for each year

def commits_per(field, interval="year"):
    s = search()
    agg = s.agg_date_histogram(interval, calendar_interval=interval)
    #agg = s.agg_terms("author", field="author")
    agg = agg.agg_terms("weekday", field=field, size=100)
    agg.execute().plot.heatmap(
        sort=True, transpose=True,
        annot=False, fmt=".0f", cmap="gray_r", figsize=(15, .3),
    )
commits_per("timestamp_weekday")
commits_per("timestamp_hour")
_images/gitlogs_11_0.png _images/gitlogs_11_1.png

authors

top 3 authors per year

s = search()
agg = s.agg_date_histogram("date", calendar_interval="year")
agg = agg.agg_terms("author", field="author", size=3)
agg_top3_authors = agg
agg.execute().df(to_index=True, flat="author", exclude="*doc_count").plot.bar(figsize=(15,4), stacked=True)
_images/gitlogs_14_1.png
commits of all top 3 authors
top_authors = set(k[1] for k in agg_top3_authors.keys())

s = search()
agg = s.agg_filters("author", filters={key: query.Term("author", key) for key in top_authors})
agg = agg.agg_date_histogram("date", calendar_interval="year")
agg.execute().plot.heatmap(
    sort=True, replace={0: np.nan},
    annot=True, fmt=".0f", figsize=(15, .6), cmap="gray_r"
)
_images/gitlogs_16_1.png

top 3 average-additions per author per year

s = search()
agg = s.agg_filters("author", filters={key: query.Term("author", key) for key in top_authors})
agg = agg.agg_date_histogram("date", calendar_interval="year")
agg = agg.metric_avg("avg-add", field="changes.additions", return_self=True)
agg.execute().plot.heatmap(
    sort=True, replace={0: np.nan},
    annot=True, fmt=".0f", figsize=(15, .6), cmap="gray_r"
)
_images/gitlogs_18_1.png

number of authors per year

s = search()
global_authors = s.metric_cardinality(field="author", return_self=True)
agg = s.agg_date_histogram("year", calendar_interval="year")
agg = agg.metric_cardinality("authors", field="author")
agg.execute().plot.bar("year", "authors", figsize=(15, 4))
print(next(global_authors.values()), "authors at all")
2504 authors at all
_images/gitlogs_20_1.png

commit messages

the first ten commit messages

s = search().sort("timestamp")
# s = s.range("timestamp", gte="2020")
for d in s.execute().documents:
    print(("-- %(timestamp)s %(hash)s\n%(message)s" % d).strip() + "\n")
-- 2009-07-31T15:07:16+00:00 9d0080576446de475d34b0dbb58389b15cd4f529
Initial directory structure.

git-svn-id: http://pandas.googlecode.com/svn/trunk@1 d5231056-7de3-11de-ac95-d976489f1ece

-- 2009-08-05T02:32:49+00:00 ec1a0a2a2571dc2c1c26612b374d4a66b22f0938
adding trunk

git-svn-id: http://pandas.googlecode.com/svn/trunk@2 d5231056-7de3-11de-ac95-d976489f1ece

-- 2009-08-05T02:33:13+00:00 1eeadf4e401647faa20911f531bc05c1872262ea
oops

git-svn-id: http://pandas.googlecode.com/svn/trunk@3 d5231056-7de3-11de-ac95-d976489f1ece

-- 2009-08-05T03:17:29+00:00 445114e1b20da8d4976c8d9050aa90c5bd508c54
added svn:ignore

git-svn-id: http://pandas.googlecode.com/svn/trunk@4 d5231056-7de3-11de-ac95-d976489f1ece

-- 2009-08-05T03:30:16+00:00 c6b236db73ff81007909be6406f0e484edc4a9eb
first commit with cleaned up code

git-svn-id: http://pandas.googlecode.com/svn/trunk@5 d5231056-7de3-11de-ac95-d976489f1ece

-- 2009-08-05T03:40:05+00:00 c8efebf2bfbe6a1efc732679ad3cf2d06d795c3f
minor edit

git-svn-id: http://pandas.googlecode.com/svn/trunk@6 d5231056-7de3-11de-ac95-d976489f1ece

-- 2009-08-05T03:54:33+00:00 21e01d94a0632539f76eb702408540b0d9adcb59
fixed isinf reference

git-svn-id: http://pandas.googlecode.com/svn/trunk@7 d5231056-7de3-11de-ac95-d976489f1ece

-- 2009-09-01T15:10:47+00:00 0f6d8b435670053a393b65c621d6eab090a36633
latest edits, miscellaneous cleanup and bug fixes from development

git-svn-id: http://pandas.googlecode.com/svn/trunk@8 d5231056-7de3-11de-ac95-d976489f1ece

-- 2009-09-01T15:13:32+00:00 171487fd4ea85aa38b224ee3cd5c41356063e197
added stats empty directory

git-svn-id: http://pandas.googlecode.com/svn/trunk@9 d5231056-7de3-11de-ac95-d976489f1ece

-- 2009-09-01T15:50:21+00:00 39c033cbe697b488f6f612c9d154a467aaca76a1
fixed inconsistency with dateCol parameter

git-svn-id: http://pandas.googlecode.com/svn/trunk@10 d5231056-7de3-11de-ac95-d976489f1ece

significant terms by year

def significant_terms_by_year(s, field, size=4, shard_size=100):
    agg = s.copy().agg_date_histogram("year", calendar_interval="year")
    agg = agg.agg_significant_terms(field=field, size=size, shard_size=shard_size)
    keywords = set(k[-1] for k in agg.execute().keys())

    agg = s.agg_date_histogram("date", calendar_interval="year")
    agg = agg.agg_filters("word", filters={key: query.Term(field, key) for key in keywords})
    agg.execute().plot.heatmap(
        sort=True, replace={0: np.nan},
        transpose=True, annot=True, fmt=".0f", figsize=(.3, .7), cmap="gray_r"
    )

significant_terms_by_year(search(), "message")
_images/gitlogs_26_0.png

significant terms by author

def significant_terms_by_terms(s, split_field, terms_field, split_size=30, size=3, shard_size=100):
    agg = s.copy().agg_terms(split_field, field=split_field, size=split_size)
    agg = agg.agg_significant_terms("term", field=terms_field, size=size, shard_size=shard_size)
    df = agg.execute().df(include=["term", "term.doc_count"])

    # find max count of all significant terms
    df = df.groupby("term").max()
    # print(df.describe())

    # and drop everything above a high percentile
    df = df[df < df.quantile(.8)].dropna()
    keywords = list(df.index)

    agg = s.agg_terms(split_field, field=split_field, size=split_size)
    agg = agg.agg_filters("term", filters={key: query.Term(terms_field, key) for key in keywords})
    agg.execute().plot.heatmap(
        sort=True, transpose=True, replace={0: np.nan},
        annot=True, fmt=".0f", figsize=(.23, .6), cmap="gray_r"
    )

significant_terms_by_terms(search(), "author", "message")
_images/gitlogs_28_0.png

files

overall top 50 edited files per year

s = search()
agg = s.agg_terms(field="changes.file", size=50)
agg = agg.agg_date_histogram("date", calendar_interval="year")
df = agg.execute().plot.heatmap(
    sort=True, replace={0: np.nan},
    annot=True, fmt=".0f", figsize=(.3, 1.5), cmap="gray_r"
)
_images/gitlogs_31_0.png

significant changed files by year

s = search().param(rest_total_hits_as_int=True)
# remove version specific files
s = ~s.query_string("changes.file: *.txt *.rst")
significant_terms_by_year(s, "changes.file")
_images/gitlogs_33_0.png

significant changed files by author

significant_terms_by_terms(search(), "author", "changes.file")
_images/gitlogs_35_0.png

which files get edited together

s = search()
s = s.query_string("changes.file: __init__.py")

agg = s.agg_terms(field="changes.file", size=50)
agg = agg.agg_date_histogram("date", calendar_interval="year")
try:
    agg.execute().plot.heatmap(figsize=(.3, 1.5), cmap="gray_r")
except ValueError:
    pass
_images/gitlogs_37_0.png

Plotting maps

Here are examples of plotting geographic data using plotly and matplotlib. Matplotlib is probably the right choice if you need a rendered image. Plotly creates interactive plots and has a more modern interface.

To handle the different geo-types returned by elasticsearch we first look at conversion utilities. Skip it if you just want to see pretty images.

Coordinate conversion

A metric aggregation like geo_centroid already returns latitude and longitude values.

Bucket-aggregations like geotile_grid and geohash_grid return keys that can be mapped to geo-coordinates.

map-tiles

The geotile_grid aggregation uses map-tiles (wikipedia) as bucket keys. They represent zoom/x/y as seen below:

from elastipy import Search

s = Search(index="elastipy-example-car-accidents")

agg = s.agg_geotile_grid("tiles", field="location", precision=6)

agg.execute().to_dict()
{'6/33/21': 131436,
 '6/34/21': 36158,
 '6/33/20': 35218,
 '6/33/22': 32519,
 '6/34/22': 19237,
 '6/34/20': 13802}

To convert the keys to geo-coordinates we can use a helper function in elastipy:

from elastipy import geotile_to_lat_lon

{
    geotile_to_lat_lon(key): value
    for key, value in agg.items()
}
{(50.736455137010644, 8.4375): 131436,
 (50.736455137010644, 14.0625): 36158,
 (54.162433968067795, 8.4375): 35218,
 (47.04018214480665, 8.4375): 32519,
 (47.04018214480665, 14.0625): 19237,
 (54.162433968067795, 14.0625): 13802}

Because the tiles are actually areas, the latitude and longitude just represent a single point within the area. The point can be chosen with the offset parameter, which defaults to (.5, .5), the center of the tile.

Here we print the top-left and bottom-right coordinates for each map-tile:

for key, value in agg.items():
    tl = geotile_to_lat_lon(key, offset=(0, 1))
    br = geotile_to_lat_lon(key, offset=(1, 0))
    print(f"{tl} - {br}: {value}")
(48.92249926375824, 5.625) - (52.48278022207821, 11.25): 131436
(48.92249926375824, 11.25) - (52.48278022207821, 16.875): 36158
(52.48278022207821, 5.625) - (55.77657301866769, 11.25): 35218
(45.08903556483103, 5.625) - (48.92249926375824, 11.25): 32519
(45.08903556483103, 11.25) - (48.92249926375824, 16.875): 19237
(52.48278022207821, 11.25) - (55.77657301866769, 16.875): 13802

geohash

The geohash_grid aggregation returns geohash (wikipedia) bucket keys.

from elastipy import Search

s = Search(index="elastipy-example-car-accidents")

agg = s.agg_geohash_grid("tiles", field="location", precision=2)

agg.execute().to_dict()
{'u1': 113676, 'u0': 85497, 'u3': 41653, 'u2': 27544}

The pygeohash package can be used to translate them:

import pygeohash

{
    pygeohash.decode(key): value
    for key, value in agg.items()
}
{(53.0, 6.0): 113676,
 (48.0, 6.0): 85497,
 (53.0, 17.0): 41653,
 (48.0, 17.0): 27544}

For convenience the pygeohash function is wrapped by elastipy.geohash_to_lat_lon.
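
Assuming the wrapper mirrors pygeohash.decode and returns a (latitude, longitude) tuple, it can be used the same way:

from elastipy import geohash_to_lat_lon

{
    geohash_to_lat_lon(key): value
    for key, value in agg.items()
}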

plotly backend

The plotly python library enables creating browser-based plots in python. It supports a range of map plots. In particular the mapbox based plots are interesting because they use WebGL and render quite fast even for a large number of items.

geo-centroid

Let’s plot an overview of the german car accidents (included in elastipy examples).

s = Search(index="elastipy-example-car-accidents")
agg = s.agg_terms("city", field="city", size=10000)
agg = agg.metric_geo_centroid("location", field="location")

df = agg.execute().df()
print(f"{df.shape[0]} cities")
df.head()
8451 cities
city city.doc_count location.lat location.lon
0 München 4979 48.145224 11.558930
1 Köln 4562 50.940086 6.961585
2 Frankfurt am Main 2639 50.117909 8.653241
3 Bremen 2459 53.091255 8.800806
4 Düsseldorf 2390 51.224550 6.799716

The geo_centroid aggregation above returns the center coordinate of all accidents within a city. (It’s not necessarily the center of the city but the centroid of all accidents that are assigned to the city.)

Below we pass the pandas DataFrame to the plotly express function and tell it the names of the latitude and longitude columns. The number of accidents per city is also used for the color and size of the points.

import plotly.express as px

fig = px.scatter_mapbox(
    df,
    lat="location.lat", lon="location.lon",
    color="city.doc_count", opacity=.5, size="city.doc_count",
    zoom=4.8,
    mapbox_style="carto-positron",
    hover_data=["city"],
    labels={"city.doc_count": "number of accidents"},

)
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})

The most amazing thing we should notice is that the federal state Mecklenburg-Vorpommern does not have any accidents! 🍀

density heatmap

The plotly express tools are just lovely ♥ ❤️ ♥ ❤️

fig = px.density_mapbox(
    df,
    lat="location.lat", lon="location.lon",
    z="city.doc_count",
    zoom=4.8,
    mapbox_style="carto-positron",
    hover_data=["city"],
    labels={"city.doc_count": "number of accidents"},
)
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})

geotile_grid aggregation

Below is the same data-set but aggregated with the geotile_grid aggregation.

import plotly.graph_objects as go
import plotly.express as px

from elastipy import geotile_to_lat_lon

s = Search(index="elastipy-example-car-accidents")
agg = s.agg_geotile_grid("location", field="location", precision=10, size=1000)

df = agg.execute().df()

# put lat and lon columns into dataframe
df[["lat", "lon"]] = list(df["location"].map(geotile_to_lat_lon))
print(df.head())

fig = px.scatter_mapbox(
    df,
    lat="lat", lon="lon",
    color="location.doc_count", opacity=.5, size="location.doc_count",
    mapbox_style="carto-positron",
    zoom=5,
    labels={"location.doc_count": "number of accidents"},

)
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
     location  location.doc_count        lat        lon
0  10/550/335                6468  52.589701  13.535156
1  10/540/330                5817  53.644638  10.019531
2  10/544/355                5021  48.107431  11.425781
3  10/549/335                4314  52.589701  13.183594
4  10/531/340                4242  51.508742   6.855469

geotile_grid aggregation

Let’s see if we can do something with the geotile_grid aggregation. The lengthy loop in the middle builds a list of lines connecting the corners of each returned map-tile.

Unfortunately, the fillcolor in mapbox can only be one fixed color and does not support color scaling (like the marker). If you know differently or have an idea how to color the rendered tiles according to aggregated values, please let me know.

import plotly.graph_objects as go
import plotly.colors

from elastipy import Search, geotile_to_lat_lon

s = Search(index="elastipy-example-car-accidents")

agg = s.agg_geotile_grid(
    "location",
    field="location", precision=8, size=1000,
)
agg.execute()

lat, lon = [], []
for key, value in agg.items():
    tl = geotile_to_lat_lon(key, offset=(0, 1))
    tr = geotile_to_lat_lon(key, offset=(1, 1))
    bl = geotile_to_lat_lon(key, offset=(0, 0))
    br = geotile_to_lat_lon(key, offset=(1, 0))
    lat += [tl[0], tr[0], br[0], bl[0], tl[0], None]
    lon += [tl[1], tr[1], br[1], bl[1], tl[1], None]

fig = go.Figure(go.Scattermapbox(
    lat=lat, lon=lon,
    fill="toself",
    fillcolor="rgba(0,0,0,.1)",
))
fig.update_layout(
    mapbox=dict(
        style="carto-positron",
        zoom=5,
        center=dict(lat=51., lon=10.3),
    ),
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)

matplotlib backend

Matplotlib does not come with specific geo functionality out-of-the-box. Instead a couple of additional libraries must be used.

geotile_grid aggregation

Here is an example using geopandas. It extends the pandas.DataFrame with the geopandas.GeoDataFrame class.

The GeoDataFrame will pick the "geometry" column from a DataFrame by default. The values must be shapely geometries.

from shapely.geometry import Point
import geopandas
import matplotlib.pyplot as plt
import matplotlib.colors

from elastipy import Search, geotile_to_lat_lon

s = Search(index="elastipy-example-car-accidents")
agg = s.agg_geotile_grid("location", field="location", precision=10)

df = agg.execute().df()

# take hash from location column,
#   convert to latitude and longitude
#   and create a shapely.Point
#   (which expects longitude, latitude)
df["geometry"] = df.pop("location").map(
    lambda v: Point(geotile_to_lat_lon(v)[::-1])
)

# have a color for each point with matplotlib tools
cmap = plt.cm.magma
norm = matplotlib.colors.Normalize(
    df["location.doc_count"].min(), df["location.doc_count"].max()
)
df["color"] = df["location.doc_count"].map(lambda v: cmap(norm(v))[:3] + (.5,))

gdf = geopandas.GeoDataFrame(df)

fig, ax = plt.subplots(figsize=(10, 10))
# plot a shapefile from https://biogeo.ucdavis.edu/data/gadm3.6
geopandas.read_file("cache/gadm36_DEU_1.shp").plot(ax=ax, color="#e0e0e0")

gdf.plot(
    c=gdf["color"], markersize=gdf["location.doc_count"] / 3,
    aspect=1.3,
    ax=ax,
)
_images/plotting-maps_35_1.png