from elasticsearch import Elasticsearch, helpers
from elasticsearch_dsl import Search, Q
import json

es = Elasticsearch()

index_name = 'stemmed_py'

if es.indices.exists(index=index_name):
    print("{} already exists; let's get rid of it...".format(index_name))
    es.indices.delete(index=index_name)

es.indices.create(index=index_name, body={
    'settings': {
        'analysis': {
            'analyzer': {
                'default': {
                    'type': 'standard'
                }
            }
        }
    }
})

# if we want to do one-off indexing of a relatively small number of documents:
for line in open('../grants.first100.json'):
    this_grant = json.loads(line)
    res = es.index(index=index_name, body=this_grant, doc_type='grant', id=this_grant['app_id'])
    print(res['created'])

# if we've got a larger number of documents, we want to use the bulk API:
# def grant_source(idx_name):
#     for line in open('grants.first100.json'):
#         this_grant = json.loads(line)
#         yield {'_index': idx_name, '_id': this_grant['app_id'], '_type': 'grant', '_source': this_grant}
#
# helpers.bulk(es, grant_source(index_name))

# helpers.bulk takes a Python iterable that produces action specifications, so we can stream a large
# data set and not have to load it all into memory at once.

# either way, we'll want to refresh the index
es.indices.refresh(index=index_name)

####################### Time to query!

# we are using the query DSL, which is more idiomatic than the straight-up ES API
# for details, see https://elasticsearch-dsl.readthedocs.io/en/latest/
this_search = Search(using=es, index=index_name).query("match", proj_title="study")
response = this_search.execute()

print("There were {} results!".format(response.hits.total))
for hit in response:
    print(hit.proj_title)
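
# The Q helper imported above lets us compose boolean queries from individual clauses.
# A minimal sketch (the second search term, "trial", is just an illustrative example,
# not something from the data set): combining two Q objects with | produces a
# bool/should query under the hood.
combined_query = Q("match", proj_title="study") | Q("match", proj_title="trial")
combined_search = Search(using=es, index=index_name).query(combined_query)
for hit in combined_search.execute():
    print(hit.proj_title)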
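
# A Search only returns the first page of hits (10 by default). If we want to stream
# every matching document, elasticsearch-dsl exposes the scan/scroll API via scan();
# a sketch, reusing the match query defined above:
for hit in this_search.scan():
    print(hit.proj_title)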