| Trees | Indices | Help |
|
|---|
|
|
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright 2006 Zuza Software Foundation
4 #
5 # This file is part of translate.
6 #
7 # translate is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2 of the License, or
10 # (at your option) any later version.
11 #
12 # translate is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
16 #
17 # You should have received a copy of the GNU General Public License
18 # along with translate; if not, write to the Free Software
19 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 #
21
22 """Module to deal with different types and uses of segmentation"""
23
24 #XXX: This module is now deprecated: Use language specific segmenters in the
25 # lang package (character_iter, word_iter, sentence_iter, etc.).
26
27 punctuation = u".,;:!?-@#$%^*_()[]{}/\\'\"<>‘’‚‛“”„‟′″‴‵‶‷‹›«»±³¹²°¿©®×£¥"
28
30 """Returns an iterator over the characters in text."""
31 #We don't return more than one consecutive whitespace character
32 prev = 'A'
33 for c in text:
34 if c.isspace() and prev.isspace():
35 continue
36 prev = c
37 if not (c in punctuation):
38 yield c.lower()
39
43
45 """Returns an iterator over the words in text."""
46 #TODO: Consider replacing punctuation with space before split()
47 for w in text.split():
48 yield w.strip(punctuation).lower()
49
53
55 """Returns an iterator over the sentences in text."""
56 #TODO: This is very naïve. We really should consider all punctuation,
57 #and return the punctuation with the sentence.
58 #TODO: Check that the next sentence starts with a capital letter, to avoid
59 #confusion with abbreviations. And remember Afrikaans "'n" :-)
60 for s in text.split(". "):
61 yield s.strip()
62
66
| Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Wed Mar 26 12:49:41 2008 | http://epydoc.sourceforge.net |