Commit 0ffa8d64 authored by alexlag's avatar alexlag
Browse files

abstract api, TexterraNLP + some test

parent b6a4bb98
......@@ -32,3 +32,5 @@ build/
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
.rvmrc
.env
require 'rake/testtask'
# Define a Rake::TestTask whose load path additionally contains 'test'.
Rake::TestTask.new { |test_task| test_task.libs << 'test' }

# Running `rake` without arguments delegates to the :test task.
desc 'Run tests'
task default: :test
\ No newline at end of file
require "./lib/ispapi/version"
# Gem specification for the ISPRAS API Ruby SDK.
# Version number and release date are taken from the Version module
# (lib/ispapi/version).
Gem::Specification.new do |s|
  s.name = 'ispapi'
  s.version = Version.current
  s.date = Version.current_date

  # Use a NUL-delimited listing: splitting on `$\` (nil unless changed)
  # falls back to splitting on any whitespace, which breaks file names
  # that contain spaces.
  s.files = `git ls-files -z`.split("\x0")
  s.require_paths = ["lib"]

  s.add_runtime_dependency 'httparty', '~> 0.13.3'
  s.add_runtime_dependency 'nori', '~> 2.4.0'

  s.add_development_dependency 'rake', '~> 10.4.2'
  s.add_development_dependency 'minitest', '~> 5.5.0'
  s.add_development_dependency 'dotenv', '~> 1.0.2'

  s.summary = "ISPRAS API Ruby SDK"
  s.authors = ["Alexey Laguta"]
  s.email = 'laguta@ispras.ru'
end
\ No newline at end of file
require_relative 'ispapi/texterra_api'
\ No newline at end of file
# Error type raised by the API clients (invalid API key, failed request).
class ApiError < StandardError; end
\ No newline at end of file
require 'httparty'
require 'nori'
require_relative 'api_error'
# Base client for ISPRAS API services.
#
# Requests are sent through HTTParty; the body of every successful
# response is parsed with Nori (tags converted to snake_case Symbols).
class IsprasAPI
  include HTTParty
  # debug_output $stdout
  ROOT_URL = 'api.ispras.ru/%s/%s'

  # Pass the raw body through untouched; parsing is done with Nori.
  # NOTE(review): HTTParty appears to call custom parsers with
  # (body, format); a Proc silently ignores the extra argument, a lambda
  # would not -- confirm before changing this to a lambda.
  parser Proc.new { |data| data }

  # @param key [String] API key, must be exactly 40 characters long
  # @param name [String] service name substituted into ROOT_URL
  # @param ver [String] service version substituted into ROOT_URL
  # @raise [ApiError] if the key is missing or has a wrong length
  def initialize(key, name, ver)
    raise ApiError, 'Please provide proper apikey' unless key && key.size == 40

    # NOTE: both settings are applied to the class, not to this instance.
    self.class.base_uri ROOT_URL % [name, ver]
    self.class.default_params apikey: key
    @nori = Nori.new(parser: :rexml, convert_tags_to: lambda { |tag| tag.snakecase.to_sym })
  end

  # Performs a GET request relative to the service root.
  #
  # @param path [String] path below the service root
  # @param params [Hash] query parameters
  # @return parsed response body
  # @raise [ApiError] if the response code is not 200
  def GET(path='', params={})
    options = { query: params }
    parse_response self.class.get("/#{path}", options)
  end

  # Performs a POST request relative to the service root.
  #
  # @param path [String] path below the service root
  # @param params [Hash] query parameters
  # @param form [Hash] request body
  # @return parsed response body
  # @raise [ApiError] if the response code is not 200
  def POST(path='', params={}, form={})
    options = { query: params, body: form }
    parse_response self.class.post("/#{path}", options)
  end

  private

  # Shared tail of GET and POST: reject non-200 responses, parse the rest.
  def parse_response(response)
    check_error response unless response.code == 200
    @nori.parse response.body
  end

  # Raises a generic error carrying the response code.
  def check_error(response)
    raise ApiError, "#{response.code} Error occured"
  end
end
\ No newline at end of file
require_relative './kbm_specs'
# Mixin reserved for Texterra KBM methods; it defines no methods yet.
module TexterraKBM; end
\ No newline at end of file
module TexterraKBMSpecs
  # Path and parameters for preset KBM queries.
  # The table maps a query name to its path template; each entry is
  # expanded to { path: <template>, params: {} } with its own params hash.
  KBMSpecs = {
    termPresence: 'representation/%s/contained',
    termInfoMeasure: 'representation/%s/infomeasure',
    termMeanings: 'representation/%s/meanings',
    termCommonness: 'representation/%s/commonness/%s',
    neighbours: 'walker/%s/neighbours%s',
    similarityGraph: 'similarity/%s/graph',
    allPairsSimilarity: 'similarity/%s/summed/%s',
    similarityToVirtualArticle: 'similarity/%s/toVirtualArticle/%s',
    similarityBetweenVirtualArticle: 'similarity/%s/betweenVirtualArticles/%s',
    similarOverFirstNeighbours: 'similarity/%s/similar/neighbours',
    similarOverFilteredNeighbours: 'similarity/%s/similar/all'
  }.each_with_object({}) do |(query, template), specs|
    specs[query] = { path: template, params: {} }
  end
end
\ No newline at end of file
require_relative './nlp_specs'
# NLP part of the Texterra client.
#
# Every public method posts the text to a preset NLP query (see NLPSpecs)
# and returns the annotations found in the response. The host class must
# provide POST(path, params, form).
module TexterraNLP
  include TexterraNLPSpecs

  # Detects language of given text
  #
  # @param text [String] text to process
  # @return [Array] Texterra annotations
  def language_detection_annotate(text)
    presetNLP(:languageDetection, text)
  end

  # Detects boundaries of sentences in a given text
  #
  # @param text [String] text to process
  # @return [Array] Texterra annotations
  def sentence_detection(text)
    presetNLP(:sentenceDetection, text)
  end

  # Detects all tokens (minimal significant text parts) in a given text
  #
  # @param text [String] text to process
  # @return [Array] Texterra annotations
  def tokenization_annotate(text)
    presetNLP(:tokenization, text)
  end

  # Detects lemma of each word of a given text
  #
  # @param text [String] text to process
  # @return [Array] Texterra annotations
  def lemmatization_annotate(text)
    presetNLP(:lemmatization, text)
  end

  # Detects part of speech tag for each word of a given text
  #
  # @param text [String] text to process
  # @return [Array] Texterra annotations
  def pos_tagging_annotate(text)
    presetNLP(:posTagging, text)
  end

  # Tries to correct misprints and other spelling errors in a given text
  #
  # @param text [String] text to process
  # @return [Array] Texterra annotations
  def spelling_correction_annotate(text)
    presetNLP(:spellingCorrection, text)
  end

  # Finds all named entities occurrences in a given text
  #
  # @param text [String] text to process
  # @return [Array] Texterra annotations
  def named_entities_annotate(text)
    presetNLP(:namedEntities, text)
  end

  # Extracts not overlapping terms within a given text; term is a textual representation for some concept of the real world
  #
  # @param text [String] text to process
  # @return [Array] Texterra annotations
  def term_detection_annotate(text)
    presetNLP(:termDetection, text)
  end

  # Detects the most appropriate meanings (concepts) for terms occurred in a given text
  #
  # @param text [String] text to process
  # @return [Array] Texterra annotations
  def disambiguation_annotate(text)
    presetNLP(:disambiguation, text)
  end

  # Key concepts are the concepts providing short (conceptual) and informative text description.
  # This service extracts a set of key concepts for a given text
  #
  # @param text [String] text to process
  # @return [Array] Texterra annotations
  def key_concepts_annotate(text)
    presetNLP(:keyConcepts, text)
  end

  # Detects the most appropriate domain for the given text.
  # Currently only 2 specific domains are supported: 'movie' and 'politics'
  # If no domain from this list has been detected, the text is assumed to be no domain, or general domain
  #
  # @param text [String] text to process
  # @return [Array] Texterra annotations
  def domain_detection_annotate(text)
    presetNLP(:domainDetection, text)
  end

  # Detects whether the given text is subjective or not
  #
  # @param text [String] text to process
  # @return [Array] Texterra annotations
  def subjectivity_detection_annotate(text)
    presetNLP(:subjectivityDetection, text)
  end

  # Detects whether the given text has positive, negative or no sentiment
  #
  # @param text [String] text to process
  # @return [Array] Texterra annotations
  def polarity_detection_annotate(text)
    presetNLP(:polarityDetection, text)
  end

  # Extracts aspect-sentiment pairs from the given text. Currently only movie domain is supported
  #
  # @param text [String] text to process
  # @return [Array] Texterra annotations
  def aspect_extraction_annotate(text)
    presetNLP(:aspectExtraction, text)
  end

  # Detects whether the given text has positive, negative, or no sentiment, with respect to domain.
  # If domain isn't provided, Domain detection is applied, this way method tries to achieve best results.
  # If no domain is detected general domain algorithm is applied
  #
  # @param text [String] text to process
  # @param domain [String] domain to use; nil or empty means "not provided"
  # @return [Array] Texterra annotations
  def domain_polarity_detection_annotate(text, domain='')
    specs = NLPSpecs[:domainPolarityDetection]
    # The domain is appended to the path in parentheses, or omitted entirely.
    suffix = domain.to_s.empty? ? '' : '(%s)' % domain
    response = POST(specs[:path] % suffix, specs[:params], {text: text})
    annotations_with_text(response, text)
  end

  # Detects Twitter-specific entities: Hashtags, User names, Emoticons, URLs.
  # And also: Stop-words, Misspellings, Spelling suggestions, Spelling corrections
  #
  # @param text [String] text to process
  # @return [Array] Texterra annotations
  def tweet_normalization(text)
    presetNLP(:tweetNormalization, text)
  end

  private

  # Utility NLP part method: runs the preset query +methodName+ on +text+.
  def presetNLP(methodName, text)
    specs = NLPSpecs[methodName]
    response = POST(specs[:path], specs[:params], {text: text})
    annotations_with_text(response, text)
  end

  # Pulls the annotation list out of a parsed response and attaches the
  # annotated fragment of +text+ to each annotation (:text and
  # :annotated_text). Returns [] when the response carries no annotations.
  def annotations_with_text(response, text)
    document = (response || {})[:nlp_document] || {}
    found = (document[:annotations] || {})[:i_annotation]
    return [] if found.nil?

    # A single annotation arrives as a bare Hash rather than a one-item Array.
    found = [found] unless found.is_a?(Array)
    found.each do |annotation|
      first, last = annotation[:start].to_i, annotation[:end].to_i
      # NOTE(review): the range is inclusive of :end -- confirm that the
      # service reports an inclusive end offset.
      annotation[:text] = annotation[:annotated_text] = text[first..last]
    end
  end
end
\ No newline at end of file
module TexterraNLPSpecs
  # Path and parameters for preset NLP queries.
  # Each row is [query name, path, annotation class(es), filtering mode]
  # and is expanded to { path:, params: { class:, filtering: } }.
  NLPSpecs = [
    [:languageDetection,
     'nlp/ru.ispras.texterra.core.nlp.pipelines.LanguageDetectionPipeline',
     'ru.ispras.texterra.core.nlp.datamodel.Language', 'KEEPING'],
    [:sentenceDetection,
     'nlp/sentence',
     'ru.ispras.texterra.core.nlp.datamodel.Sentence', 'KEEPING'],
    [:tokenization,
     'nlp/token',
     'ru.ispras.texterra.core.nlp.datamodel.Token', 'KEEPING'],
    [:lemmatization,
     'nlp/lemma',
     'ru.ispras.texterra.core.nlp.datamodel.Lemma', 'KEEPING'],
    [:posTagging,
     'nlp/pos',
     'ru.ispras.texterra.core.nlp.datamodel.pos.IPOSToken', 'KEEPING'],
    [:spellingCorrection,
     'nlp/ru.ispras.texterra.core.nlp.annotators.spelling.SpellingCorrector',
     'ru.ispras.texterra.core.nlp.datamodel.SpellingCorrection', 'KEEPING'],
    [:namedEntities,
     'nlp/ru.ispras.texterra.core.nlp.pipelines.NETaggingPipeline',
     'ru.ispras.texterra.core.nlp.datamodel.ne.NamedEntityToken', 'KEEPING'],
    [:termDetection,
     'nlp/ru.ispras.texterra.core.nlp.pipelines.TermDetectionPipeline',
     'ru.ispras.texterra.core.nlp.datamodel.Frame', 'KEEPING'],
    [:disambiguation,
     'nlp/ru.ispras.texterra.core.nlp.pipelines.DisambiguationPipeline',
     'ru.ispras.texterra.core.nlp.datamodel.DisambiguatedPhrase', 'KEEPING'],
    [:keyConcepts,
     'nlp/ru.ispras.texterra.core.nlp.pipelines.KeyConceptsPipeline',
     'ru.ispras.texterra.core.nlp.datamodel.KeyconceptsSemanticContext', 'KEEPING'],
    [:domainDetection,
     'nlp/domain',
     'domain', 'KEEPING'],
    [:subjectivityDetection,
     'nlp/subjectivity',
     'ru.ispras.texterra.core.nlp.datamodel.SentimentSubjectivity', 'KEEPING'],
    [:polarityDetection,
     'nlp/polarity',
     'ru.ispras.texterra.core.nlp.datamodel.SentimentPolarity', 'KEEPING'],
    [:aspectExtraction,
     'nlp/aspectsentiment',
     'aspect-sentiment', 'KEEPING'],
    [:domainPolarityDetection,
     'nlp/domainpolarity%s',
     ['domain', 'sentiment-polarity'], 'KEEPING'],
    [:tweetNormalization,
     'nlp/twitterdetection',
     ['sentence', 'language', 'token'], 'REMOVING']
  ].each_with_object({}) do |(query, path, annotation_class, filtering), specs|
    specs[query] = {
      path: path,
      params: { class: annotation_class, filtering: filtering }
    }
  end
end
\ No newline at end of file
require_relative './ispras_api'
require_relative './texterra/nlp'
require_relative './texterra/kbm'
# This class provides methods to work with Texterra REST via OpenAPI, including NLP and EKB methods and custom queries.
# Note that NLP methods return annotations only
class TexterraAPI < IsprasAPI
  include TexterraNLP, TexterraKBM
  disable_rails_query_string_format

  # @param key [String] API key
  # @param name [String, nil] service name; nil or empty falls back to 'texterra'
  # @param ver [String, nil] service version; nil or empty falls back to 'v3.1'
  # @raise [ApiError] if the key is rejected by the base class
  def initialize(key, name='texterra', ver='v3.1')
    name = 'texterra' if name.nil? || name.empty?
    ver = 'v3.1' if ver.nil? || ver.empty?
    super(key, name, ver)
  end

  # Section of NLP methods
  # NLP basic helper methods

  # Key concepts are the concepts providing short (conceptual) and informative text description.
  # This service extracts a set of key concepts for a given text
  #
  # @param text [String] text to process
  # @return [Array] list of weighted key concepts
  def key_concepts(text)
    annotation = key_concepts_annotate(text).first
    value = annotation && annotation[:value]
    weights = value && value[:concepts_weights]
    entries = (weights && weights[:entry]) || []
    # A single entry arrives as a bare Hash rather than a one-item Array.
    entries = [entries] unless entries.is_a?(Array)
    entries.map do |entry|
      entry[:concept][:weight] = entry[:double]
      entry[:concept]
    end
  end

  # Detects whether the given text has positive, negative or no sentiment
  #
  # @param text [String] text to process
  # @return [String] sentiment of the text, 'NEUTRAL' when none is reported
  def sentiment_analysis(text)
    annotation = polarity_detection_annotate(text).first
    polarity = annotation && annotation[:value]
    # `nil.to_s` is '', so the fallback has to be applied before converting.
    polarity ? polarity.to_s : 'NEUTRAL'
  rescue NoMethodError
    # Safety net for responses that lack the expected annotation structure.
    'NEUTRAL'
  end

  # Detects whether the given text has positive, negative, or no sentiment, with respect to domain.
  # If domain isn't provided, Domain detection is applied, this way method tries to achieve best results.
  # If no domain is detected general domain algorithm is applied
  #
  # @param text [String] text to process
  # @param domain [String] domain to use. Can be empty
  # @return [Hash] used :domain and detected :polarity
  def domain_sentiment_analysis(text, domain='')
    used_domain = 'general'
    sentiment = 'NEUTRAL'
    (domain_polarity_detection_annotate(text, domain) || []).each do |an|
      # to_s guards against annotations that carry no @class attribute.
      annotation_class = an[:@class].to_s
      sentiment = an[:value] if annotation_class.include? 'SentimentPolarity'
      used_domain = an[:value] if annotation_class.include? 'DomainAnnotation'
    end
    {
      domain: used_domain,
      polarity: sentiment
    }
  end

  # Extracts aspect-sentiment pairs from the given text. Currently only movie domain is supported
  #
  # @param text [String] text to process
  # @return [Array] list of found aspects, each with :text, :aspect and :polarity
  def aspect_extraction(text)
    (aspect_extraction_annotate(text) || []).map do |asp|
      {
        text: asp[:text],
        aspect: asp[:value][:aspect],
        polarity: asp[:value][:polarity]
      }
    end
  end

  # Detects the most appropriate meanings (concepts) for terms occurred in a given text
  #
  # @param text [String] text to process
  # @return [Array] Texterra annotations
  def disambiguation(text)
    disambiguation_annotate(text)
  end

  private

  # Raises ApiError with the "root cause" text of the error page, stripped
  # of the leading `ru.ispras...:` prefix. Falls back to the generic
  # base-class error when the page has no recognisable root cause.
  def check_error(response)
    message = root_cause_message(response)
    return super unless message

    raise ApiError, message.gsub(/ru\.ispras.*:\s*/, '')
  end

  # Returns the :pre text of the paragraph marked 'root cause' in the parsed
  # error page (html > body > p), or nil if the page is shaped differently.
  def root_cause_message(response)
    page = @nori.parse response.body
    paragraphs = [:html, :body, :p].inject(page) do |node, key|
      node.is_a?(Hash) ? node[key] : nil
    end
    # A single <p> arrives as a bare node rather than a one-item Array.
    paragraphs = [paragraphs] unless paragraphs.is_a?(Array)
    er_node = paragraphs.detect { |node| node.is_a?(Hash) && node[:b] == 'root cause' }
    pre = er_node && er_node[:pre]
    pre.respond_to?(:gsub) ? pre : nil
  end
end
\ No newline at end of file
# Version number and release date of the gem.
module Version
  MAJOR = 0
  MINOR = 0
  PATCH = 1
  PRE = nil

  YEAR = "2015"
  MONTH = "2"
  DAY = "6"

  # @return [String] dotted version; PRE is left out when it is nil
  def self.to_s
    [MAJOR, MINOR, PATCH, PRE].compact.join(".")
  end

  # @return [String] same as {to_s}
  def self.current
    to_s
  end

  # Release date as YYYY-MM-DD. Month and day are zero-padded because
  # Gem::Specification#date= only accepts Strings in exactly that format.
  #
  # @return [String] zero-padded release date
  def self.current_date
    [YEAR, MONTH, DAY].map { |part| part.to_s.rjust(2, '0') }.join('-')
  end

  # Splits a dotted version string into its components.
  #
  # @param version [String] e.g. "1.2.3" or "1.2.3.pre"
  # @return [Hash] :major, :minor, :patch and :pre as Strings (nil when absent)
  def self.version_to_h(version)
    major, minor, patch, pre = version.split('.')
    { major: major, minor: minor, patch: patch, pre: pre }
  end
end
\ No newline at end of file
require 'minitest/autorun'
require 'dotenv'
Dotenv.load
require_relative '../lib/ispapi'
# Tests for TexterraAPI; they call the client built in #setup.
class TestTexterraAPI < Minitest::Test
  # Builds the client from the KEY, SERVICE_NAME and SERVICE_VERSION
  # environment variables and prepares English/Russian sample inputs.
  def setup
    @texterra = TexterraAPI.new ENV['KEY'], ENV['SERVICE_NAME'], ENV['SERVICE_VERSION']
    @en_text = 'Apple today updated iMac to bring numerous high-performance enhancements to the =s leading all-in-one desktop. iMac now features fourth-generation Intel Core processors, new graphics, and next-generation Wi-Fi. In addition, it now supports PCIe-based flash storage, making its Fusion Drive and all-flash storage options up to 50 percent faster than the previous generation'
    @ru_text = 'Первые в этом году переговоры министра иностранных дел России Сергея Лаврова и госсекретаря США Джона Керри, длившиеся 1,5 часа, завершились в Мюнхене.'
    @en_tweet = 'mentioning veterens care which Mccain has voted AGAINST - SUPER GOOOOD point Obama+1 #tweetdebate'
    @ru_tweet = 'В мастерской готовят пушку и автомобили 1940-х годов, для участия в Параде Победы в Ново-Переделкино.'
  end

  # key_concepts returns an Array for both languages.
  def test_key_concepts
    [@en_text, @ru_text].each do |sample|
      assert_instance_of Array, @texterra.key_concepts(sample)
    end
  end

  # disambiguation returns an Array for both languages.
  def test_disambiguation
    [@en_text, @ru_text].each do |sample|
      assert_instance_of Array, @texterra.disambiguation(sample)
    end
  end

  # sentiment_analysis returns a String for texts and tweets alike.
  def test_sentiment_analysis
    [@en_text, @ru_text, @en_tweet, @ru_tweet].each do |sample|
      assert_instance_of String, @texterra.sentiment_analysis(sample)
    end
  end

  # tweet_normalization returns an Array for the English tweet and is
  # expected to raise ApiError for the Russian text.
  def test_tweet_normalization
    assert_instance_of Array, @texterra.tweet_normalization(@en_tweet)
    assert_raises(ApiError) { @texterra.tweet_normalization(@ru_text) }
  end
end
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment