
# ----------------------------------------------------------
# AdvaS Advanced Search 0.2.3
# advanced search algorithms implemented as a python module
#
# (C) 2002 - 2005 Frank Hofmann, Chemnitz, Germany
# email fh@efho.de
# ----------------------------------------------------------

# changed 2005-01-28

# other modules required by advas
import string
import re
import math

# import advas modules (separated due to growing complexity)
# basic functions 
from io import *
from basicStrings import *
from basicLists import *
from successorVariety import *
from descriptors import *
from phonetics import *
from synonym import *
from category import *
from ngram import *

# advanced functions ---------------------------------------

# - probability functions
def tf (text):
	"""Return the term frequencies for a given text.

	The text is broken into single words by split_line() and the words
	are counted by count_words(); the result of count_words() (a
	word:frequency collection) is returned unchanged.
	"""

	# split, count, and hand the counting result straight back
	return count_words(split_line(text))

def tf_stop (text, stop_list):
	"""Return the term frequencies of text without the stop-list items.

	text      : a line of text
	stop_list : dictionary of items to be removed

	The term frequencies come from tf(); the filtering itself is
	delegated to remove_items().
	"""

	# term frequencies of the whole line
	frequencies = tf(text)

	# drop everything that is named in the stop list
	return remove_items(frequencies, stop_list)

def idf (documents, word_list):
	"""Return the inverse document frequency for every term in word_list.

	documents : N, the total number of documents
	word_list : mapping term -> n, the number of documents containing
	            that term

	The result is a new dictionary term -> ln(N/n).
	"""

	result = {}

	for term, frequency in word_list.items():
		# idf = ln(N/n), calculated with floats to avoid integer division
		result[term] = math.log(float(documents) / float(frequency))
	# end for

	return result

# - stemming methods

def table_lookup_stemmer (term, stem_file):
	"""Return the stem of term as listed in a stem file.

	term      : the term to look up
	stem_file : name of the stem file; expected format, one entry per
	            line and sorted alphabetically:
	                term : stem

	Returns the stem belonging to term, or an empty string if the file
	cannot be read (get_file_contents() returned -1) or the term is not
	listed. The lookup is a binary search, so an unsorted file gives
	unreliable results.
	"""

	# define empty stem
	stem = ""

	# open stem file for reading, get contents
	contents = get_file_contents(stem_file)
	if (contents == -1):
		# can't read from given file name
		return stem
	# end if

	# a word followed by a colon, whitespace or a comma; the first
	# match of a line is the term, the second one its stem
	# (compiled once, not once per search step)
	pattern = re.compile(r"\w+(?=[:\s,])")

	# search for given term (binary search)
	left = 0
	right = len(contents) - 1

	while (right >= left):
		# floor division keeps the index an integer under both
		# classic and true division
		middle = (right + left) // 2

		# extract term and stem from the middle line
		item_list = pattern.findall(contents[middle])
		item = item_list[0]

		# compare item and term
		result = cmp_strings(term, item)

		if (result == 0):
			# match - return correct stem
			return item_list[1]
		elif (result < 0):
			# mismatch, somewhere before
			right = middle - 1
		else:
			# mismatch, somewhere later
			# cmp_strings() is expected to return -1, 0 or 1; deciding
			# by sign guarantees that the interval shrinks in any case
			left = middle + 1
		# end if
	# end while

	# term is not listed
	return stem

def ngram_stemmer (word_list, size, equality):
	"""Reduce word_list according to the n-gram stemming method.

	word_list : list of terms
	size      : n-gram size, handed to comp_ngrams()
	equality  : similarity threshold; two terms are conflated if their
	            comp_ngrams() value is greater than this threshold

	Every pair of terms is compared. Of two similar terms the longer
	one is dropped (for equal lengths, the one that appears later in
	word_list). Returns the list of the remaining terms, in the key
	order of the dictionary built by convert_list_into_dictionary().
	"""

	# terms to be removed, later
	stop_list = []

	for i, term1 in enumerate(word_list):
		# compare with every term that precedes term1
		for term2 in word_list[:i]:
			# calculate n-gram value and compare
			if comp_ngrams(term1, term2, size) > equality:
				# these terms are so similar that they can be conflated
				# remove the longer term, keep the shorter one
				if (len(term2) > len(term1)):
					stop_list.append(term2)
				else:
					stop_list.append(term1)
				# end if
			# end if
		# end for
	# end for

	# conflate the matrix
	# work with dictionaries instead of lists
	return_dict = convert_list_into_dictionary(word_list, 0)
	stop_dict = convert_list_into_dictionary(stop_list, 0)

	# keep all the items which do not appear in stop_list
	# ("in" instead of has_key(), which Python 3 no longer offers)
	return [item for item in return_dict if item not in stop_dict]

def successor_variety_stemmer (term, word_list, flag):
	"""Split term into stems according to the successor variety algorithm.

	term      : the term to examine
	word_list : handed to calc_succ_variety()
	flag      : handed to calc_succ_variety()

	The peak-and-plateau method is used to find the word boundaries: a
	boundary is set behind every inner character whose value in the
	successor variety table is greater than the values of both of its
	neighbours. Returns the list of the resulting slices. Terms with
	fewer than three characters have no inner character and are
	returned as a single slice (an empty term gives an empty list).

	NOTE(review): the table returned by calc_succ_variety() is indexed
	with single characters of term; a character missing from that table
	presumably raises KeyError - confirm against calc_succ_variety().
	"""

	# get basic list
	variety = calc_succ_variety(word_list, flag)

	term_length = len(term)

	# start of the slice currently collected
	start = 0

	# list of stems
	stem_list = []

	# examine the inner characters only (both neighbours must exist)
	for i in range(1, term_length - 1):
		previous_char = term[i-1]
		current_char = term[i]
		next_char = term[i+1]

		# check for a peak
		if (variety[current_char] > variety[previous_char]) and (variety[current_char] > variety[next_char]):
			# save slice as a stem
			stem_list.append(term[start:i+1])

			# adjust start
			start = i + 1
		# end if
	# end for

	# still something left in buffer?
	# (tested with start instead of the loop variable, which is not
	# set at all if the term is too short for the loop to run)
	if (start < term_length):
		stem_list.append(term[start:])
	# end if

	# return result
	return stem_list

# - phonetic methods / applied linguistics

def phonetic_code (term):
	"""Return the phonetic codes of term for several methods.

	The result is a dictionary with the keys "soundex", "metaphone",
	"nysiis" and "caverphone"; each value is the return value of the
	function of the same name called with term.
	"""

	# one entry per method
	codes = {}
	codes["soundex"] = soundex(term)
	codes["metaphone"] = metaphone(term)
	codes["nysiis"] = nysiis(term)
	codes["caverphone"] = caverphone(term)

	return codes

def is_language (text, stop_list, flag):
	"""Old name of is_language_by_keywords().

	All arguments are forwarded unchanged to is_language_by_keywords(),
	which substitutes this function; its result is returned as is.
	"""

	score = is_language_by_keywords(text, stop_list, flag)
	return score

def is_language_by_keywords (text, stop_list, flag):
	"""Determine the language of a given text with the use of keywords.

	text      : text to examine
	stop_list : list of items used to determine the language
	flag      : 0=stop_list is a list, text is a string
	            1=stop_list is a file name, text is a string
	            2=stop_list is a list, text is a file name
	            3=stop_list is a file name, text is a file name

	Returns the share of stop-list items that occur in the text, as a
	float between 0 and 1. Returns 0 if one of the given files cannot
	be read, and 0.0 if the stop list is empty. The stop-list items are
	stripped and lower-cased before the comparison; a list handed in by
	the caller is not modified.

	NOTE(review): the words of the text are compared as delivered by
	tf() and compact_list(); whether they are lower-cased there is not
	visible from this function - confirm.
	"""

	if flag in (1, 3):
		# stop_list is a file name: read the items from that file
		stop_list = get_file_contents(stop_list)
		if (stop_list == -1):
			# can't read from given file
			return 0
		# end if
	# end if

	if flag in (2, 3):
		# text is a file name: read the file and glue its lines together
		text_file = get_file_contents(text)
		if (text_file == -1):
			# can't read from given file
			return 0
		# end if

		text = "".join(text_file)
	# end if

	# strip and lower each stop-list item; a new list is built so that
	# the list of the caller stays untouched
	keywords = [item.strip().lower() for item in stop_list]

	if not keywords:
		# nothing to compare with (and no division by zero below)
		return 0.0
	# end if

	# get list of words using tf, then compact it
	text_tf = compact_list(tf(text))

	# verify each item : in text?
	matches = 0
	for keyword in keywords:
		if keyword in text_tf:
			matches = matches + 1
		# end if
	# end for

	# share of the stop-list items found in the text
	return float(matches) / float(len(keywords))

# - ranking methods

def kNN (vector_1, vector_2):
	"""Return the Euclidean distance between two sparse vectors.

	vector_1, vector_2 : dictionaries component -> value; a component
	                     missing in one of the vectors counts as 0

	Despite its name (kept for the callers) this function does not
	classify anything; it calculates the distance measure a k-nearest
	neighbour search can be based on: the square root of the sum of
	the squared differences of all components.
	"""

	global_distance = 0.0

	# components of the first vector (shared ones are handled here)
	for item in vector_1:
		first_value = float(vector_1[item])
		other_value = 0.0
		if item in vector_2:
			other_value = float(vector_2[item])
		# end if
		difference = first_value - other_value
		global_distance = global_distance + difference * difference
	# end for

	# components that appear in the second vector only
	for item in vector_2:
		if item in vector_1:
			continue	# don't count again
		# end if
		other_value = float(vector_2[item])
		global_distance = global_distance + other_value * other_value
	# end for

	return math.sqrt(global_distance)

def rank (request, document_list, order):
	"""Rank the documents by the equality of their descriptors with the request.

	request       : descriptors of the request, handed to comp_descriptors()
	document_list : list of document descriptors
	order         : 1=ascending equality, any other value=descending

	Returns a list with one dictionary per document:
	    "descriptors" : the entry of document_list
	    "equality"    : value of comp_descriptors(request, document)
	    "list_no"     : position of the document in document_list
	"""

	# kept sorted by descending equality while it is built
	ranking_list = []

	for list_no, document in enumerate(document_list):
		equality = comp_descriptors (request, document)
		ranking_entry = {
			"descriptors" : document,
			"equality" : equality,
			"list_no" : list_no
		}

		if not ranking_list:
			# still an empty ranking list
			ranking_list.append(ranking_entry)
		elif (ranking_list[0]["equality"] <= equality):
			# not smaller than the best entry so far: new head
			ranking_list.insert(0, ranking_entry)
		elif (ranking_list[-1]["equality"] >= equality):
			# not greater than the worst entry so far: new tail
			ranking_list.append(ranking_entry)
		else:
			# search for an appropriate place to insert new entry
			# (binary search); head is greater and tail is smaller than
			# the new value, so the place is inside the list
			left = 0
			right = len(ranking_list) - 1

			while (right > left):
				# floor division keeps the index an integer
				middle = (right + left) // 2

				if (ranking_list[middle]["equality"] <= equality):
					right = middle
				else:
					left = middle + 1
				# end if
			# end while

			# left is now the first entry that is not greater than the
			# new one. The new entry belongs right in front of it; the
			# last value of middle is not always that position.
			ranking_list.insert(left, ranking_entry)
		# end if
	# end for

	if (order == 1):
		# ascending order requested
		ranking_list.reverse()
	# end if

	return ranking_list


