# Commit 040b0277, authored by Мария Григорьева
# Browse files
#
# Upload New File
#
# parent 0cb1ebab
import logging
import pandas as pd
from elasticsearch import Elasticsearch
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim
# Shared Nominatim client used by get_location().
# NOTE(review): the user agent is still the placeholder value; confirm what
# string should identify this application to the geocoding service.
geolocator = Nominatim(user_agent="specify_your_app_name_here")

# Log to a file; this script's logger is set to emit everything from DEBUG up.
LOG_FILENAME = 'affiliations.log'
logging.basicConfig(filename=LOG_FILENAME)
logger = logging.getLogger('AffiliationsLogger')
logger.setLevel(logging.DEBUG)

# Index names. INDEX_NAME is not referenced in the visible code; AFFILIATIONS
# is the index that main() reads from.
INDEX_NAME = "russian_affiliations_with_coords"
AFFILIATIONS = "inspire-beta-affiliations"

# Elasticsearch client pointed at a node on localhost.
es = Elasticsearch([{'host': '127.0.0.1', 'port': 9200}])

# Query body that matches every document in an index.
match_all = {
    "query": {
        "match_all": {},
    },
}
def scroller(es, index, request, pagesize=250, scroll_timeout="2m", **kwargs):
    """Yield every hit of a search, page by page, using the scroll API.

    Args:
        es: Client object providing ``search``, ``scroll`` and ``clear_scroll``.
        index: Name of the index to search.
        request: Query body passed to the initial search.
        pagesize: Number of hits requested per page.
        scroll_timeout: How long the scroll context is kept alive between
            pages. Used for the initial search as well as every follow-up
            scroll call (the initial search previously hard-coded "2m").
        **kwargs: Extra keyword arguments forwarded to the initial search.

    Yields:
        The raw hit dictionaries, in the order the pages return them.
    """
    scroll_id = None
    try:
        result = es.search(index=index, scroll=scroll_timeout, size=pagesize,
                           body=request, **kwargs)
        while True:
            # Every response carries the id to use for the next page.
            scroll_id = result["_scroll_id"]
            hits = result["hits"]["hits"]
            if not hits:
                break
            yield from hits
            result = es.scroll(body={
                "scroll_id": scroll_id,
                "scroll": scroll_timeout,
            })
    finally:
        # Release the scroll context instead of leaving it to time out; this
        # also runs when the consumer stops iterating early.
        if scroll_id is not None:
            try:
                es.clear_scroll(scroll_id=scroll_id)
            except Exception:
                # Cleanup is best effort: never mask the real outcome.
                logging.getLogger(__name__).warning(
                    "could not clear scroll context %s", scroll_id,
                    exc_info=True)
def get_location(address, city, code):
    """Geocode an affiliation: try the address first, then the city.

    Both lookups use the module-level Nominatim ``geolocator`` and are
    restricted to the country given by ``code``.

    Args:
        address: Query for the first lookup.
        city: Query used when the address lookup finds nothing.
        code: Country code, passed to the geocoder as ``country_codes=[code]``.

    Returns:
        A ``(latitude, longitude)`` tuple for the first match, or
        ``(None, None)`` when neither lookup matches or the geocoder times
        out. A tuple is always returned so callers can unpack the result.
    """
    try:
        location = geolocator.geocode(address,
                                      addressdetails=True,
                                      exactly_one=True,
                                      country_codes=[code],
                                      timeout=30)
        if location is None:
            # Nothing found for the full address: retry with the city only.
            location = geolocator.geocode(city,
                                          exactly_one=True,
                                          country_codes=[code],
                                          timeout=30)
    except GeocoderTimedOut as e:
        logger.error(f'geocode failed on input {address}')
        # Python 3 exceptions have no ``.message``; log the exception text.
        logger.error(str(e))
        return None, None

    if location is None:
        return None, None
    logger.info((location.latitude, location.longitude))
    return location.latitude, location.longitude
def _geocode_address(addr):
    """Return (lat, lon) for an address record without stored coordinates."""
    code = addr.get('country_code')
    if code is None:
        # The lookup is restricted by country, so it cannot run without one.
        logger.warning('address has no country_code, skipping geocoding')
        return None, None

    # presumably a list of address lines (the original sliced off the last
    # three); verify against the index mapping. The geocoder expects a text
    # query, so the lines are joined instead of being passed as a list.
    postal = addr.get('postal_address') or []
    if isinstance(postal, str):
        query = postal
    else:
        query = ', '.join(str(line) for line in postal[-3:])

    # Fall back to the country code when no city is recorded.
    cities = addr.get('cities') or []
    city = cities[0] if cities else code

    # Guard against a get_location() that returns None instead of a pair.
    return get_location(query, city, code) or (None, None)


def main():
    """Export coordinates of all affiliations to ``affiliations_geoloc.csv``.

    Walks every document of the AFFILIATIONS index, takes the first address
    of each record, uses its stored latitude/longitude when present and
    geocodes it otherwise, then writes one ``id, lat, lon`` row per record.
    Records with no addresses are skipped.

    NOTE(review): the original indentation was lost, so whether records
    without addresses were meant to be skipped or written with empty
    coordinates cannot be told from the source; confirm the intent.
    """
    rows = []
    for entry in scroller(es, AFFILIATIONS, match_all):
        source = entry['_source']
        # affiliation record ID
        record_id = str(source['id'])

        addresses = source['metadata'].get('addresses')
        if not addresses:
            # Key missing or list empty: nothing to locate.
            continue
        addr = addresses[0]

        latitude = addr.get('latitude')
        longitude = addr.get('longitude')
        # Geocode only when the record stores neither coordinate.
        if latitude is None and longitude is None:
            latitude, longitude = _geocode_address(addr)

        rows.append({'id': record_id,
                     'lat': latitude,
                     'lon': longitude})

    # Explicit columns keep set_index('id') working when no rows were found.
    affiliations = pd.DataFrame(rows, columns=['id', 'lat', 'lon']).set_index('id')
    affiliations.to_csv('affiliations_geoloc.csv')
# Run the export only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
# Supports Markdown
# 0% or .
# You are about to add 0 people to the discussion. Proceed with caution.
# Finish editing this message first!
# Please register or sign in to comment