Commit 5bc2de9c authored by alexlag's avatar alexlag
Browse files

use JSON in texterra NLP

parent 4667fe35
require 'json'
require 'httparty'
require 'nori'
require_relative './api_error'
......@@ -17,20 +18,42 @@ class IsprasAPI
end
end
# Performs an HTTP GET against "/#{path}" through self.class.get and returns
# response.parsed_response when the status code is 200; any other status is
# handed to check_error.
# NOTE(review): two `def GET` headers and two `options` assignments appear
# here, which looks like a rendered diff keeping both the earlier
# (path, params) form and the later form that adds `format` and the hash
# produced by headers(format) -- confirm which lines are live before editing;
# as shown, the `def`/`end` pairs do not balance.
def GET(path = '', params = {})
options = { query: params }
def GET(path = '', params = {}, format=:xml)
options = {
headers: headers(format),
query: params
}
response = self.class.get "/#{path}", options
response.code == 200 ? response.parsed_response : check_error(response)
end
# Performs an HTTP POST against "/#{path}" through self.class.post, sending
# `params` as the query string and a request body, and returns
# response.parsed_response when the status code is 200; any other status is
# handed to check_error.
# NOTE(review): two `def POST` headers and two `options` assignments appear
# here, which looks like a rendered diff keeping both the earlier
# (path, params, form) form and the later (path, params, body, format) form
# -- confirm which lines are live before editing; as shown, the `def`/`end`
# pairs do not balance.
def POST(path = '', params = {}, form = {})
options = { query: params, body: form }
def POST(path = '', params = {}, body = {}, format=:xml)
options = {
headers: headers(format),
query: params,
body: body
}
response = self.class.post "/#{path}", options
response.code == 200 ? response.parsed_response : check_error(response)
end
private
# Builds the request headers that ask the service for a given response format.
#
# @param [Symbol] format :json or :xml; any other value selects no format
# @return [Hash] a newly built hash with the matching 'Accept' header, or an
#   empty hash when the format is not recognised
def headers(format)
  mime_type = { json: 'application/json', xml: 'application/xml' }[format]
  mime_type ? { 'Accept' => mime_type } : {}
end
# Reports a failed request to the caller by raising.
#
# @param response an object responding to #code
# @raise [ApiError] always; the message starts with the response code
def check_error(response)
  message = "#{response.code} Error occured"
  raise ApiError, message
end
......
......@@ -5,7 +5,7 @@ module TexterraNLP
# Detects language of given text
#
# @param [String] text Text to process
# @return [Array] Texterra annotations
# @return [Hash] Texterra document
def language_detection_annotate(text)
# Thin wrapper: hands `text` to preset_nlp under the :languageDetection key
# and returns that call's result untouched.
preset_nlp(:languageDetection, text)
end
......@@ -13,7 +13,7 @@ module TexterraNLP
# Detects boundaries of sentences in a given text
#
# @param [String] text Text to process
# @return [Array] Texterra annotations
# @return [Hash] Texterra document
def sentence_detection_annotate(text)
# Thin wrapper: hands `text` to preset_nlp under the :sentenceDetection key
# and returns that call's result untouched.
preset_nlp(:sentenceDetection, text)
end
......@@ -21,7 +21,7 @@ module TexterraNLP
# Detects all tokens (minimal significant text parts) in a given text
#
# @param [String] text Text to process
# @return [Array] Texterra annotations
# @return [Hash] Texterra document
def tokenization_annotate(text)
# Thin wrapper: hands `text` to preset_nlp under the :tokenization key and
# returns that call's result untouched.
preset_nlp(:tokenization, text)
end
......@@ -29,7 +29,7 @@ module TexterraNLP
# Detects lemma of each word of a given text
#
# @param [String] text Text to process
# @return [Array] Texterra annotations
# @return [Hash] Texterra document
def lemmatization_annotate(text)
# Thin wrapper: hands `text` to preset_nlp under the :lemmatization key and
# returns that call's result untouched.
preset_nlp(:lemmatization, text)
end
......@@ -37,7 +37,7 @@ module TexterraNLP
# Detects part of speech tag for each word of a given text
#
# @param [String] text Text to process
# @return [Array] Texterra annotations
# @return [Hash] Texterra document
def pos_tagging_annotate(text)
# Thin wrapper: hands `text` to preset_nlp under the :posTagging key and
# returns that call's result untouched.
preset_nlp(:posTagging, text)
end
......@@ -45,7 +45,7 @@ module TexterraNLP
# Tries to correct misprints and other spelling errors in a given text
#
# @param [String] text Text to process
# @return [Array] Texterra annotations
# @return [Hash] Texterra document
def spelling_correction_annotate(text)
# Thin wrapper: hands `text` to preset_nlp under the :spellingCorrection key
# and returns that call's result untouched.
preset_nlp(:spellingCorrection, text)
end
......@@ -53,7 +53,7 @@ module TexterraNLP
# Finds all named entity occurrences in a given text
#
# @param [String] text Text to process
# @return [Array] Texterra annotations
# @return [Hash] Texterra document
def named_entities_annotate(text)
# Thin wrapper: hands `text` to preset_nlp under the :namedEntities key and
# returns that call's result untouched.
preset_nlp(:namedEntities, text)
end
......@@ -61,7 +61,7 @@ module TexterraNLP
# Extracts non-overlapping terms within a given text; a term is a textual representation of some real-world concept
#
# @param [String] text Text to process
# @return [Array] Texterra annotations
# @return [Hash] Texterra document
def term_detection_annotate(text)
# Thin wrapper: hands `text` to preset_nlp under the :termDetection key and
# returns that call's result untouched.
preset_nlp(:termDetection, text)
end
......@@ -69,7 +69,7 @@ module TexterraNLP
# Detects the most appropriate meanings (concepts) for terms occurring in a given text
#
# @param [String] text Text to process
# @return [Array] Texterra annotations
# @return [Hash] Texterra document
def disambiguation_annotate(text)
# Thin wrapper: hands `text` to preset_nlp under the :disambiguation key and
# returns that call's result untouched.
preset_nlp(:disambiguation, text)
end
......@@ -78,7 +78,7 @@ module TexterraNLP
# This service extracts a set of key concepts for a given text
#
# @param [String] text Text to process
# @return [Array] Texterra annotations
# @return [Hash] Texterra document
def key_concepts_annotate(text)
# Thin wrapper: hands `text` to preset_nlp under the :keyConcepts key and
# returns that call's result untouched.
preset_nlp(:keyConcepts, text)
end
......@@ -88,7 +88,7 @@ module TexterraNLP
# If no domain from this list has been detected, the text is assumed to be no domain, or general domain
#
# @param [String] text Text to process
# @return [Array] Texterra annotations
# @return [Hash] Texterra document
def domain_detection_annotate(text)
# Thin wrapper: hands `text` to preset_nlp under the :domainDetection key and
# returns that call's result untouched.
preset_nlp(:domainDetection, text)
end
......@@ -96,7 +96,7 @@ module TexterraNLP
# Detects whether the given text is subjective or not
#
# @param [String] text Text to process
# @return [Array] Texterra annotations
# @return [Hash] Texterra document
def subjectivity_detection_annotate(text)
# Thin wrapper: hands `text` to preset_nlp under the :subjectivityDetection
# key and returns that call's result untouched.
preset_nlp(:subjectivityDetection, text)
end
......@@ -104,7 +104,7 @@ module TexterraNLP
# Detects whether the given text has positive, negative or no sentiment
#
# @param [String] text Text to process
# @return [Array] Texterra annotations
# @return [Hash] Texterra document
def polarity_detection_annotate(text)
# Thin wrapper: hands `text` to preset_nlp under the :polarityDetection key
# and returns that call's result untouched.
preset_nlp(:polarityDetection, text)
end
......@@ -115,21 +115,22 @@ module TexterraNLP
#
# @param [String] text Text to process
# @param [String] domain Domain for polarity detection
# @return [Array] Texterra annotations
# @return [Hash] Texterra document
def domain_polarity_detection_annotate(text, domain = '')
# Looks up the request spec under :domainPolarityDetection in NLP_SPECS.
specs = NLP_SPECS[:domainPolarityDetection]
# A non-empty domain is wrapped in parentheses before being substituted into
# the path template with `%`; an empty domain is substituted as-is.
domain = "(#{domain})" unless domain.empty?
# NOTE(review): two different POST calls follow. The first digs into
# [:nlp_document][:annotations][:i_annotation] and normalises the result to an
# Array; the second requests :json and walks result[:annotations]. This looks
# like a rendered diff keeping the removed body next to the added one --
# confirm which lines are live. As shown, the `return`/`map` lines run first.
result = POST(specs[:path] % domain, specs[:params], text: text)[:nlp_document][:annotations][:i_annotation]
return [] if result.nil?
result = [].push result unless result.is_a? Array
result.map { |e| assign_text(e, text) }
result = POST(specs[:path] % domain, specs[:params], {text: text}, :json)
# Replaces every annotation in each list, in place, with assign_text's result.
result[:annotations].each do |key, value|
value.map! { |an| assign_text(an, text) }
end
result
end
# Detects Twitter-specific entities: Hashtags, User names, Emoticons, URLs.
# And also: Stop-words, Misspellings, Spelling suggestions, Spelling corrections
#
# @param [String] text Text to process
# @return [Array] Texterra annotations
# @return [Hash] Texterra document
def tweet_normalization(text)
# Thin wrapper: hands `text` to preset_nlp under the :tweetNormalization key
# and returns that call's result untouched.
preset_nlp(:tweetNormalization, text)
end
......@@ -137,11 +138,13 @@ module TexterraNLP
# Detects syntax relations in text. Only works for Russian texts
#
# @param [String] text Text to process
# @return [Array] Texterra annotations
# @return [Hash] Texterra document
def syntax_detection(text)
# NOTE(review): two iteration forms appear below -- one over preset_nlp's
# return value updating an[:value][:parent_token], and one over
# result[:annotations][:'syntax-relation'] updating an[:value][:parent]. This
# looks like a rendered diff keeping the removed loop next to the added one;
# the `do`/`end` pairs do not balance as shown. Confirm which lines are live.
preset_nlp(:syntaxDetection, text).each do |an|
an[:value][:parent_token] = assign_text(an[:value][:parent_token], text) if an[:value] && an[:value][:parent_token]
result = preset_nlp(:syntaxDetection, text)
# The parent entry is only rewritten via assign_text when both an[:value] and
# the parent entry are present.
result[:annotations][:'syntax-relation'].each do |an|
an[:value][:parent] = assign_text(an[:value][:parent], text) if an[:value] && an[:value][:parent]
end
result
end
private
......@@ -149,10 +152,11 @@ module TexterraNLP
# Utility NLP part method
def preset_nlp(methodName, text)
# Looks up the request spec (its :path and :params are used below) for the
# given key in NLP_SPECS.
specs = NLP_SPECS[methodName]
# NOTE(review): two different POST calls follow. The first digs into
# [:nlp_document][:annotations][:i_annotation] and normalises the result to an
# Array; the second requests :json and walks result[:annotations]. This looks
# like a rendered diff keeping the removed body next to the added one --
# confirm which lines are live. As shown, the `return`/`map` lines run first.
result = POST(specs[:path], specs[:params], text: text)[:nlp_document][:annotations][:i_annotation]
return [] if result.nil?
result = [].push result unless result.is_a? Array
result.map { |an| assign_text(an, text) }
result = POST(specs[:path], specs[:params], {text: text}, :json)
# Replaces every annotation in each list, in place, with assign_text's result.
result[:annotations].each do |key, value|
value.map! { |an| assign_text(an, text) }
end
result
end
# Utility text assignment for annotation
......
......@@ -38,20 +38,15 @@ class TexterraAPI < IsprasAPI
# @param [String] text Text to process
# @return [Array] Array of weighted key concepts
def key_concepts(text)
# NOTE(review): the local `key_concepts` is assigned twice from two different
# shapes of key_concepts_annotate's result ([0][:value][:concepts_weights][:entry]
# versus [:annotations][:keyconcepts][0][:value]). This looks like a rendered
# diff keeping the removed body next to the added one -- confirm which lines
# are live before editing.
key_concepts = key_concepts_annotate(text)[0][:value][:concepts_weights][:entry] || []
key_concepts = [].push key_concepts unless key_concepts.is_a? Array
# Copies each entry's :double onto its :concept as :weight and yields the concept.
key_concepts.map do |kc|
kc[:concept][:weight] = kc[:double]
kc[:concept]
end
# As written, this final assignment is the method's return value; it falls
# back to an empty Array when the :value entry is nil or false.
key_concepts = key_concepts_annotate(text)[:annotations][:keyconcepts][0][:value] || []
end
# Detects whether the given text has positive, negative or no sentiment
#
# @param [String] text Text to process
# @return [Array] Sentiment of the text
# @return [String] Sentiment of the text
def sentiment_analysis(text)
# NOTE(review): two lookups with different result shapes follow ([0][:value]
# versus [:annotations][:polarity][0][:value]); this looks like a rendered
# diff keeping the removed line next to the added one -- confirm which is live.
# NOTE(review): `.to_s` always returns a String, so the `|| 'NEUTRAL'`
# fallback on these lines can never be taken; the 'NEUTRAL' default is
# supplied only by the rescue clause below.
polarity_detection_annotate(text)[0][:value].to_s || 'NEUTRAL'
polarity_detection_annotate(text)[:annotations][:polarity][0][:value].to_s || 'NEUTRAL'
# A NoMethodError raised while digging into the result (for example indexing
# into nil) is turned into the 'NEUTRAL' answer.
rescue NoMethodError
'NEUTRAL'
end
......@@ -66,9 +61,11 @@ class TexterraAPI < IsprasAPI
def domain_sentiment_analysis(text, domain = '')
used_domain = 'general'
sentiment = 'NEUTRAL'
(domain_polarity_detection_annotate(text, domain) || []).each do |an|
sentiment = an[:value] if an[:@class].include? 'SentimentPolarity'
used_domain = an[:value] if an[:@class].include? 'DomainAnnotation'
annotations = domain_polarity_detection_annotate(text, domain)[:annotations]
begin
used_domain = annotations[:domain][0][:value]
sentiment = annotations[:polarity][0][:value]
rescue NoMethodError
end
{
domain: used_domain,
......@@ -81,7 +78,7 @@ class TexterraAPI < IsprasAPI
# @param [String] text Text to process
# @return [Array] Texterra annotations
def disambiguation(text)
# NOTE(review): disambiguation_annotate is called twice and the first result
# is discarded; this looks like a rendered diff keeping the removed line next
# to the added one -- confirm which is live. As written, the method returns
# the :'disambiguated-phrase' entry of the second call's :annotations.
disambiguation_annotate(text)
disambiguation_annotate(text)[:annotations][:'disambiguated-phrase']
end
def custom_query(path, query, form = nil)
......
......@@ -44,61 +44,123 @@ class TestTexterraAPI < Minitest::Test
end
# Exercises tweet_normalization with the @en_tweet and @ru_tweet fixtures.
# NOTE(review): the `assert_instance_of Array` lines contradict the later
# `assert_instance_of Hash` / `res[:text]` checks, which suggests a rendered
# diff keeping the old Array assertions next to the new Hash ones -- confirm
# which set is current.
def test_tweet_normalization
assert_instance_of Array, @texterra.tweet_normalization(@en_tweet)
assert_instance_of Array, @texterra.tweet_normalization(@ru_tweet)
res = @texterra.tweet_normalization(@en_tweet)
assert_instance_of Hash, res
assert_equal @en_tweet, res[:text]
res = @texterra.tweet_normalization(@ru_tweet)
assert_instance_of Hash, res
assert_equal @ru_tweet, res[:text]
end
# Exercises syntax_detection with the @ru_text fixture.
# NOTE(review): the `assert_instance_of Array` line contradicts the later
# `assert_instance_of Hash` / `res[:text]` checks, which suggests a rendered
# diff keeping the old assertion next to the new ones -- confirm which is
# current.
def test_syntax_detection
assert_instance_of Array, @texterra.syntax_detection(@ru_text)
res = @texterra.syntax_detection(@ru_text)
assert_instance_of Hash, res
assert_equal @ru_text, res[:text]
end
# Exercises language_detection_annotate with the @en_text, @ru_text, @en_tweet
# and @ru_tweet fixtures.
# NOTE(review): the four `assert_instance_of Array` lines contradict the later
# `assert_instance_of Hash` / `res[:text]` checks, which suggests a rendered
# diff keeping the old Array assertions next to the new Hash ones -- confirm
# which set is current.
def test_language_detection_annotate
assert_instance_of Array, @texterra.language_detection_annotate(@en_text)
assert_instance_of Array, @texterra.language_detection_annotate(@ru_text)
assert_instance_of Array, @texterra.language_detection_annotate(@en_tweet)
assert_instance_of Array, @texterra.language_detection_annotate(@ru_tweet)
res = @texterra.language_detection_annotate(@en_text)
assert_instance_of Hash, res
assert_equal @en_text, res[:text]
res = @texterra.language_detection_annotate(@ru_text)
assert_instance_of Hash, res
assert_equal @ru_text, res[:text]
res = @texterra.language_detection_annotate(@en_tweet)
assert_instance_of Hash, res
assert_equal @en_tweet, res[:text]
res = @texterra.language_detection_annotate(@ru_tweet)
assert_instance_of Hash, res
assert_equal @ru_tweet, res[:text]
end
# Exercises sentence_detection_annotate with the @en_text, @ru_text, @en_tweet
# and @ru_tweet fixtures.
# NOTE(review): the four `assert_instance_of Array` lines contradict the later
# `assert_instance_of Hash` / `res[:text]` checks, which suggests a rendered
# diff keeping the old Array assertions next to the new Hash ones -- confirm
# which set is current.
def test_sentence_detection_annotate
assert_instance_of Array, @texterra.sentence_detection_annotate(@en_text)
assert_instance_of Array, @texterra.sentence_detection_annotate(@ru_text)
assert_instance_of Array, @texterra.sentence_detection_annotate(@en_tweet)
assert_instance_of Array, @texterra.sentence_detection_annotate(@ru_tweet)
res = @texterra.sentence_detection_annotate(@en_text)
assert_instance_of Hash, res
assert_equal @en_text, res[:text]
res = @texterra.sentence_detection_annotate(@ru_text)
assert_instance_of Hash, res
assert_equal @ru_text, res[:text]
res = @texterra.sentence_detection_annotate(@en_tweet)
assert_instance_of Hash, res
assert_equal @en_tweet, res[:text]
res = @texterra.sentence_detection_annotate(@ru_tweet)
assert_instance_of Hash, res
assert_equal @ru_tweet, res[:text]
end
# Exercises tokenization_annotate with the @en_text, @ru_text, @en_tweet and
# @ru_tweet fixtures.
# NOTE(review): the four `assert_instance_of Array` lines contradict the later
# `assert_instance_of Hash` / `res[:text]` checks, which suggests a rendered
# diff keeping the old Array assertions next to the new Hash ones -- confirm
# which set is current.
def test_tokenization_annotate
assert_instance_of Array, @texterra.tokenization_annotate(@en_text)
assert_instance_of Array, @texterra.tokenization_annotate(@ru_text)
assert_instance_of Array, @texterra.tokenization_annotate(@en_tweet)
assert_instance_of Array, @texterra.tokenization_annotate(@ru_tweet)
res = @texterra.tokenization_annotate(@en_text)
assert_instance_of Hash, res
assert_equal @en_text, res[:text]
res = @texterra.tokenization_annotate(@ru_text)
assert_instance_of Hash, res
assert_equal @ru_text, res[:text]
res = @texterra.tokenization_annotate(@en_tweet)
assert_instance_of Hash, res
assert_equal @en_tweet, res[:text]
res = @texterra.tokenization_annotate(@ru_tweet)
assert_instance_of Hash, res
assert_equal @ru_tweet, res[:text]
end
# Exercises lemmatization_annotate with the @en_text, @ru_text, @en_tweet and
# @ru_tweet fixtures.
# NOTE(review): the four `assert_instance_of Array` lines contradict the later
# `assert_instance_of Hash` / `res[:text]` checks, which suggests a rendered
# diff keeping the old Array assertions next to the new Hash ones -- confirm
# which set is current.
def test_lemmatization_annotate
assert_instance_of Array, @texterra.lemmatization_annotate(@en_text)
assert_instance_of Array, @texterra.lemmatization_annotate(@ru_text)
assert_instance_of Array, @texterra.lemmatization_annotate(@en_tweet)
assert_instance_of Array, @texterra.lemmatization_annotate(@ru_tweet)
res = @texterra.lemmatization_annotate(@en_text)
assert_instance_of Hash, res
assert_equal @en_text, res[:text]
res = @texterra.lemmatization_annotate(@ru_text)
assert_instance_of Hash, res
assert_equal @ru_text, res[:text]
res = @texterra.lemmatization_annotate(@en_tweet)
assert_instance_of Hash, res
assert_equal @en_tweet, res[:text]
res = @texterra.lemmatization_annotate(@ru_tweet)
assert_instance_of Hash, res
assert_equal @ru_tweet, res[:text]
end
# Exercises pos_tagging_annotate with the @en_text, @ru_text, @en_tweet and
# @ru_tweet fixtures.
# NOTE(review): the four `assert_instance_of Array` lines contradict the later
# `assert_instance_of Hash` / `res[:text]` checks, which suggests a rendered
# diff keeping the old Array assertions next to the new Hash ones -- confirm
# which set is current.
def test_pos_tagging_annotate
assert_instance_of Array, @texterra.pos_tagging_annotate(@en_text)
assert_instance_of Array, @texterra.pos_tagging_annotate(@ru_text)
assert_instance_of Array, @texterra.pos_tagging_annotate(@en_tweet)
assert_instance_of Array, @texterra.pos_tagging_annotate(@ru_tweet)
res = @texterra.pos_tagging_annotate(@en_text)
assert_instance_of Hash, res
assert_equal @en_text, res[:text]
res = @texterra.pos_tagging_annotate(@ru_text)
assert_instance_of Hash, res
assert_equal @ru_text, res[:text]
res = @texterra.pos_tagging_annotate(@en_tweet)
assert_instance_of Hash, res
assert_equal @en_tweet, res[:text]
res = @texterra.pos_tagging_annotate(@ru_tweet)
assert_instance_of Hash, res
assert_equal @ru_tweet, res[:text]
end
# Exercises named_entities_annotate with the @en_text, @ru_text, @en_tweet and
# @ru_tweet fixtures.
# NOTE(review): the four `assert_instance_of Array` lines contradict the later
# `assert_instance_of Hash` / `res[:text]` checks, which suggests a rendered
# diff keeping the old Array assertions next to the new Hash ones -- confirm
# which set is current.
def test_named_entities_annotate
assert_instance_of Array, @texterra.named_entities_annotate(@en_text)
assert_instance_of Array, @texterra.named_entities_annotate(@ru_text)
assert_instance_of Array, @texterra.named_entities_annotate(@en_tweet)
assert_instance_of Array, @texterra.named_entities_annotate(@ru_tweet)
res = @texterra.named_entities_annotate(@en_text)
assert_instance_of Hash, res
assert_equal @en_text, res[:text]
res = @texterra.named_entities_annotate(@ru_text)
assert_instance_of Hash, res
assert_equal @ru_text, res[:text]
res = @texterra.named_entities_annotate(@en_tweet)
assert_instance_of Hash, res
assert_equal @en_tweet, res[:text]
res = @texterra.named_entities_annotate(@ru_tweet)
assert_instance_of Hash, res
assert_equal @ru_tweet, res[:text]
end
# Exercises subjectivity_detection_annotate with the @en_text, @ru_text,
# @en_tweet and @ru_tweet fixtures.
# NOTE(review): the four `assert_instance_of Array` lines contradict the later
# `assert_instance_of Hash` / `res[:text]` checks, which suggests a rendered
# diff keeping the old Array assertions next to the new Hash ones -- confirm
# which set is current.
def test_subjectivity_detection_annotate
assert_instance_of Array, @texterra.subjectivity_detection_annotate(@en_text)
assert_instance_of Array, @texterra.subjectivity_detection_annotate(@ru_text)
assert_instance_of Array, @texterra.subjectivity_detection_annotate(@en_tweet)
assert_instance_of Array, @texterra.subjectivity_detection_annotate(@ru_tweet)
res = @texterra.subjectivity_detection_annotate(@en_text)
assert_instance_of Hash, res
assert_equal @en_text, res[:text]
res = @texterra.subjectivity_detection_annotate(@ru_text)
assert_instance_of Hash, res
assert_equal @ru_text, res[:text]
res = @texterra.subjectivity_detection_annotate(@en_tweet)
assert_instance_of Hash, res
assert_equal @en_tweet, res[:text]
res = @texterra.subjectivity_detection_annotate(@ru_tweet)
assert_instance_of Hash, res
assert_equal @ru_tweet, res[:text]
end
def test_representation_terms
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment