
# ----------------------------------------------------------
# advas
# advanced search algorithms implemented as a python module
#
# (C) 2002 Frank Hofmann, Chemnitz, Germany
# email fh@efho.de
#
# example for using advas functions to identify the language
# of a given document and to derive descriptors for indexing
# from the words given in the document
#
# ----------------------------------------------------------

# changed 2002-09-23

# import advas module
import advas
import string

# define lines of text: one english and one german sample document.
# NOTE: the german text is plain ASCII, umlauts are transliterated
# (ae, ue, ...) so that every word is spelled consistently.
line_en = "The sense of the single letter options may be inverted by using + instead of -. Some of the single letter option names refer to an option being off, in which case the inversion of that name refers to the option being on. For example, +n is the short name of exec, and -n is the short name of its inversion, noexec."

line_de = "Ein betrachtetes Dokument wird linear verarbeitet. In einer Liste wird gespeichert, wie haeufig jedes einzelne Wort im Dokument vorkommt. Man nimmt an, dass, je haeufiger ein Wort im Text enthalten ist, dieses Wort umso relevanter im Sinne von bedeutungstragend fuer den Inhalt des Textes ist. Diese Termgewichte werden in einem Vektor zusammengefasst. Jede Position im Vektor repraesentiert einen Term. Der Wert an dieser Position enthaelt das spezifische Termgewicht eines Wortes fuer diesen Text oder dieses Dokument"

# get list of words using tf
# (presumably "term frequency"; the exact return type is defined by
# advas -- confirm against the advas module)
list_en = advas.tf (line_en)
list_de = advas.tf (line_de)

# read stop lists
def _read_stoplist(path):
	"""Return the entries of the stop list file *path*.

	One entry is produced per line of the file, in file order. Each
	entry has its surrounding whitespace (including the newline)
	removed and is converted to lower case.

	Raises IOError if the file cannot be opened or read.
	"""
	f = open(path, "r")
	try:
		raw_lines = f.readlines()
	finally:
		# close the file even if reading fails
		f.close()
	# end try

	return [entry.strip().lower() for entry in raw_lines]
# end def

stoplist_en = _read_stoplist("stoplist_EN.txt")
stoplist_de = _read_stoplist("stoplist_DE.txt")

# compact original lists
list_en = advas.compact_list(list_en)
list_de = advas.compact_list(list_de)

# verify each item : in stop_list?
line_en_en = 0
line_en_de = 0
line_de_en = 0
line_de_de = 0

for item in stoplist_en:
	if (list_en.has_key(item)):
		line_en_en = line_en_en + 1
	# end if

	if (list_de.has_key(item)):
		line_en_de = line_en_de + 1
	# end if
# end for

for item in stoplist_de:
	if (list_en.has_key(item)):
		line_de_en = line_de_en + 1
	# end if

	if (list_de.has_key(item)):
		line_de_de = line_de_de + 1
	# end if
# end for

print "first text:"
print "en_en :", line_en_en
print "en_de :", line_en_de
if (line_en_en > line_en_de):
	print "language: english"
else:
	print "language: german"
# end if

print "\nsecond text:"
print "de_en :", line_de_en
print "de_de :", line_de_de
if (line_de_en > line_de_de):
	print "language: english"
else:
	print "language: german"
# end if

# reduce possible index terms
cv_stoplist_en = advas.convert_list_into_dictionary(stoplist_en, 1)
cv_stoplist_de = advas.convert_list_into_dictionary(stoplist_de, 1)

# remove items from stop list
new_line_en = advas.remove_items(list_en, cv_stoplist_en)
new_line_de = advas.remove_items(list_de, cv_stoplist_de)

#print "\nenglish stop list:"
#items = cv_stoplist_en.keys()
#for i in items:
#	print i, cv_stoplist_en[i]
# end for

# output result
print "\nenglish index terms:"
items = new_line_en.keys()
for i in items:
	print i, new_line_en[i]
# end for

print "\ngerman index terms:"
items = new_line_de.keys()
for i in items:
	print i, new_line_de[i]
# end for

# reduce using the n-gram stemming method
words_en = advas.convert_dictionary_into_list(new_line_en)
words_de = advas.convert_dictionary_into_list(new_line_de)

result_en = advas.ngram_stemmer (words_en, 2, 0.8)
result_de = advas.ngram_stemmer (words_de, 2, 0.8)

print "\nenglish index terms (stemmed):"
print result_en

print "\ngerman index terms (stemmed):"
print result_de

