Commit 5064adf2 authored by Евгений Третьяков's avatar Евгений Третьяков
Browse files

Add new file

parent d4bf6b02
import json
import os
from elasticsearch import Elasticsearch
def get_grants(file_path):
    """Parse the grants list file into a mapping of record id to grant numbers.

    Each non-blank line holds a record id, whitespace, and a comma-separated
    list of grant numbers (for example ``12345 A,B``). Only the second
    whitespace-separated token is read as the grant list; empty items
    produced by stray commas are dropped.

    Args:
        file_path: Path to the UTF-8 encoded grants list file.

    Returns:
        dict: Lists of grant-number strings keyed by integer record id. A
        line that carries only a record id maps to an empty list. If a record
        id appears on several lines, the last line wins.

    Raises:
        ValueError: If the first token of a line is not an integer.
    """
    grants = {}
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            fields = line.split()
            # Blank and whitespace-only lines carry no record id; skip them
            # instead of failing on fields[0].
            if not fields:
                continue
            # A record id with no grant list is valid and means "no grants".
            numbers = fields[1].split(",") if len(fields) > 1 else []
            grants[int(fields[0])] = [number for number in numbers if number]
    return grants
def get_addresses(
    affiliation_id,
    affiliations_dir=(
        "/mnt/vdb/inspireBetaDownload/russian-affiliations-query/"
        "affiliations_jsons"
    ),
):
    """Return the address list stored in an affiliation's JSON file.

    Args:
        affiliation_id: Identifier of the affiliation; the file read is
            ``<affiliations_dir>/<affiliation_id>.json``.
        affiliations_dir: Directory holding the affiliation JSON files.
            Defaults to the location this script has always used.

    Returns:
        The value found at ``["metadata"]["addresses"]`` in the parsed file.

    Raises:
        OSError: If the affiliation file cannot be opened.
        ValueError: If the file is not valid JSON (``json.JSONDecodeError``).
        KeyError: If ``metadata`` or ``addresses`` is missing from the file.
    """
    affiliation_path = os.path.join(affiliations_dir, f"{affiliation_id}.json")
    with open(affiliation_path, "r", encoding="utf-8") as file:
        affiliation = json.load(file)
    return affiliation["metadata"]["addresses"]
def get_record(
    recid,
    records_dir=(
        "/mnt/vdb/inspireBetaDownload/russian-affiliations-query/records_jsons"
    ),
):
    """Load a record and attach affiliation addresses with geo locations.

    For every affiliation of every author, the affiliation id is taken from
    the last path segment of ``affiliation["record"]["$ref"]``, its addresses
    are fetched with ``get_addresses`` and stored under
    ``affiliation["record"]["addresses"]``. Each address that has both a
    longitude and a latitude also gets a ``location`` entry of the form
    ``{"lon": ..., "lat": ...}``.

    Problems are handled per affiliation: a malformed or incomplete
    affiliation is reported and skipped, and the remaining authors and
    affiliations are still processed.

    Args:
        recid: Identifier of the record; the file read is
            ``<records_dir>/<recid>.json``.
        records_dir: Directory holding the record JSON files. Defaults to the
            location this script has always used.

    Returns:
        dict: The parsed record, enriched in place where possible.

    Raises:
        OSError: If the record file (or an affiliation file read through
            ``get_addresses``) cannot be opened.
        ValueError: If the record file itself is not valid JSON.
    """
    record_path = os.path.join(records_dir, f"{recid}.json")
    with open(record_path, "r", encoding="utf-8") as file:
        record = json.load(file)

    authors = record.get("metadata", {}).get("authors")
    if authors is None:
        print(f"KeyError: record {recid} has no metadata.authors")
        return record

    # Authors of one record often share affiliations; read each affiliation
    # file once per record instead of once per author.
    addresses_by_id = {}
    for author in authors:
        # An author without affiliations is not an error; there is simply
        # nothing to enrich for that author.
        for affiliation in author.get("affiliations", []):
            try:
                reference = affiliation["record"]["$ref"]
                affiliation_id = int(reference.split("/")[-1])
                if affiliation_id not in addresses_by_id:
                    addresses = get_addresses(affiliation_id)
                    for address in addresses:
                        longitude = address.get("longitude")
                        latitude = address.get("latitude")
                        # Only build a location from a complete coordinate
                        # pair; a partial one is not a usable point.
                        if longitude is None or latitude is None:
                            continue
                        address["location"] = {"lon": longitude, "lat": latitude}
                    addresses_by_id[affiliation_id] = addresses
                affiliation["record"]["addresses"] = addresses_by_id[affiliation_id]
            except (KeyError, ValueError) as error:
                # Skip only this affiliation; json.JSONDecodeError from a
                # broken affiliation file is a ValueError and lands here too.
                print(
                    f"{type(error).__name__} in record {recid}: "
                    f"skipping affiliation ({error})"
                )
    return record
def load_data(folder):
    """Enrich every record found in *folder* and upload it to Elasticsearch.

    For each ``<recid>.json`` file name in *folder* the record is loaded and
    enriched through ``get_record``, its grants are attached under
    ``"grants"``, the result is written to the ``records_with_coordinates``
    directory and indexed into ``russian_affiliations_with_coords``.

    The function can be re-run: the index is created only when it does not
    exist yet, and each document is indexed under its record id, so a second
    run overwrites documents instead of duplicating them.

    Args:
        folder: Directory whose file names determine which record ids are
            processed.

    Returns:
        None.
    """
    index_name = "russian_affiliations_with_coords"
    output_dir = (
        "/mnt/vdb/inspireBetaDownload/russian-affiliations-query/"
        "records_with_coordinates"
    )

    # Only "<digits>.json" entries are records; anything else in the folder
    # (hidden files, sub-directories, backups) is ignored rather than
    # crashing the int() conversion.
    recids = []
    for file_name in os.listdir(folder):
        stem, extension = os.path.splitext(file_name)
        if extension == ".json" and stem.isdecimal():
            recids.append(int(stem))

    es = Elasticsearch()
    mapping = {
        "mappings": {
            "properties": {
                "metadata.authors.affiliations.record.addresses.location": {
                    "type": "geo_point"
                }
            }
        }
    }
    # Creating an index that already exists is an error, which would make
    # every run after the first one fail before uploading anything.
    if not es.indices.exists(index=index_name):
        es.indices.create(index=index_name, body=mapping)

    grants = get_grants("source/grants_numbers/grants_list.txt")
    os.makedirs(output_dir, exist_ok=True)

    total = len(recids)
    for i, recid in enumerate(recids, 1):
        # NOTE: get_record is called without a directory, so it reads from
        # its own built-in records path; `folder` only selects the ids.
        record = get_record(recid)
        record["grants"] = grants.get(recid, [])
        file_path = os.path.join(output_dir, f"{recid}.json")
        with open(file_path, "w", encoding="utf-8") as file:
            json.dump(record, file)
        print(f"Uploading {i} of {total}")
        # Use the record id as document id so re-runs update in place.
        result = es.index(index=index_name, id=recid, body=record)
        print(result)
if __name__ == "__main__":
    # Script entry point: process every record in the downloaded records
    # directory.
    records_folder = "/mnt/vdb/inspireBetaDownload/russian-affiliations-query/records_jsons/"
    load_data(records_folder)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment