# Source code for microtc.textmodel

# Copyright 2016-2017 Eric S. Tellez

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import unicodedata
import numpy as np
from microtc.params import OPTION_DELETE, OPTION_GROUP, OPTION_NONE
from microtc.emoticons import EmoticonClassifier
import os
from scipy.sparse import csr_matrix
from microtc.utils import get_class, SparseMatrix
from typing import Union


# Characters treated as punctuation.  The (misspelled) name is kept as is
# because it is a module-level name other code may import.
PUNCTUACTION = ";:,.@\\-\"'/"
# Brackets, question/exclamation marks and the '~' that norm_chars uses to
# replace blanks.
SYMBOLS = "()[]¿?¡!{}~<>|"
# Every character that separates words.
SKIP_SYMBOLS = set(PUNCTUACTION) | set(SYMBOLS)
# Same as SKIP_SYMBOLS plus tab, newline, carriage return and space.
SKIP_SYMBOLS_AND_SPACES = SKIP_SYMBOLS | set('\t\n\r ')
# SKIP_WORDS = set(["…", "..", "...", "...."])
# Short weighting names mapped to the dotted path of the implementing class.
WEIGHTING = {
    "tfidf": "microtc.weighting.TFIDF",
    "tf": "microtc.weighting.TF",
    "entropy": "microtc.weighting.Entropy",
}


def norm_chars(text, del_diac=True, del_dup=True, del_punc=False):
    """
    Transform text by removing diacritics, duplicates, and punctuation.
    It adds ~ at the beginning, the end, and the spaces are changed by ~.

    The text is decomposed with Unicode NFD before it is scanned, so the
    returned string is in decomposed form when `del_diac` is False.
    Every blank character produces its own ~ (blanks are never collapsed,
    not even when `del_dup` is True).

    :param text: Text
    :type text: str
    :param del_diac: Delete diacritics (code points U+0300 to U+036F)
    :type del_diac: bool
    :param del_dup: Delete duplicates (consecutive repetitions of a character)
    :type del_dup: bool
    :param del_punc: Delete punctuation symbols (those in SKIP_SYMBOLS)
    :type del_punc: bool
    :rtype: str

    Example:

    >>> from microtc.textmodel import norm_chars
    >>> norm_chars("Life is good at Méxicoo.")
    '~Life~is~god~at~Mexico.~'
    """
    blanks = ('\n', '\r', ' ', '\t', '\xa0')
    chars = ['~']
    prev = '~'
    for u in unicodedata.normalize('NFD', text):
        # After NFD the accents are separate combining characters.
        if del_diac and 0x300 <= ord(u) <= 0x036F:
            continue
        if u in blanks:
            u = '~'
        elif del_dup and prev == u:
            continue
        elif del_punc and u in SKIP_SYMBOLS:
            # The dropped symbol is still remembered so that the characters
            # around it are not considered duplicates of each other.
            prev = u
            continue
        prev = u
        chars.append(u)
    chars.append('~')
    return "".join(chars)
def get_word_list(text):
    """
    Transform a text (beginning and ending with ~) to a list of words.
    It is called after :py:func:`microtc.textmodel.norm_chars`.

    The first and last characters are discarded, every character in
    SKIP_SYMBOLS (which includes ~ and the quotes) acts as a word separator,
    and the result is split on whitespace.

    Example

    >>> from microtc.textmodel import get_word_list
    >>> get_word_list("~Someone's house.~")
    ['Someone', 's', 'house']

    :param text: text
    :type text: str
    :rtype: list
    """
    # str.split() without arguments already collapses runs of whitespace and
    # ignores leading/trailing blanks, so replacing the separators by spaces
    # is all that is needed.
    spaced = "".join(' ' if u in SKIP_SYMBOLS else u for u in text[1:-1])
    return spaced.split()
def expand_qgrams(text, qsize, output):
    """Expands a text into a list of q-grams

    Each q-gram is prefixed with ``q:`` and appended to `output`. Nothing is
    appended when the text is shorter than `qsize` or `qsize` is not positive.

    :param text: Text
    :type text: str
    :param qsize: q-gram size
    :type qsize: int
    :param output: output (modified in place)
    :type output: list
    :returns: output
    :rtype: list

    Example:

    >>> from microtc.textmodel import expand_qgrams
    >>> output = list()
    >>> expand_qgrams("Good morning.", 3, output)
    ['q:Goo', 'q:ood', 'q:od ', 'q:d m', 'q: mo', 'q:mor', 'q:orn', 'q:rni', 'q:nin', 'q:ing', 'q:ng.']
    """
    if qsize > 0:
        output.extend("q:" + text[i:i + qsize]
                      for i in range(len(text) - qsize + 1))
    return output
def expand_qgrams_word_list(wlist, qsize, output, sep='~'):
    """Expands a list of words into a list of q-grams. It uses `sep` to join words

    Nothing is appended when `wlist` has fewer than `qsize` words or when
    `qsize` is not positive.

    :param wlist: List of words computed by :py:func:`microtc.textmodel.get_word_list`.
    :type wlist: list
    :param qsize: q-gram size of words
    :type qsize: int
    :param output: output (modified in place)
    :type output: list
    :param sep: String used to join the words
    :type sep: str
    :returns: output
    :rtype: list

    Example:

    >>> from microtc.textmodel import expand_qgrams_word_list
    >>> wlist = ["Good", "morning", "Mexico"]
    >>> expand_qgrams_word_list(wlist, 2, list())
    ['Good~morning', 'morning~Mexico']
    """
    if qsize <= 0:
        # A non-positive size would only produce empty strings.
        return output
    output.extend(sep.join(wlist[start:start + qsize])
                  for start in range(len(wlist) - qsize + 1))
    return output
def expand_skipgrams_word_list(wlist, qsize, output, sep='~'):
    """Expands a list of words into a list of skipgrams. It uses `sep` to join words

    A skip-gram takes `size` words, leaving `skip` words between each pair of
    consecutive selected words. Nothing is appended when `wlist` is too short
    or when the size is not positive.

    :param wlist: List of words computed by :py:func:`microtc.textmodel.get_word_list`.
    :type wlist: list
    :param qsize: (qsize, skip) qsize is the q-gram size and skip is the number of words ahead.
    :type qsize: tuple
    :param output: output (modified in place)
    :type output: list
    :param sep: String used to join the words
    :type sep: str
    :returns: output
    :rtype: list

    Example:

    >>> from microtc.textmodel import expand_skipgrams_word_list
    >>> wlist = ["Good", "morning", "Mexico"]
    >>> expand_skipgrams_word_list(wlist, (2, 1), list())
    ['Good~Mexico']
    """
    size, skip = qsize
    if size <= 0:
        # A non-positive size would only produce empty strings.
        return output
    step = skip + 1
    # Number of consecutive words covered by one skip-gram:
    # size selected words plus skip words in each of the size - 1 gaps.
    span = size + (size - 1) * skip
    for start in range(len(wlist) - span + 1):
        output.append(sep.join(wlist[start + i * step] for i in range(size)))
    return output
class TextModel(SparseMatrix):
    """
    Text model. A text is normalized
    (:py:func:`microtc.textmodel.TextModel.text_transformations`), expanded
    into tokens (word n-grams, skip-grams and character q-grams) and the
    tokens are handed to the weighting class to obtain the vector.

    :param docs: Corpus; when given and not empty the model is fitted on it
    :type docs: list
    :param text: In the case corpus is a dict then text is the key containing the text. The environment variable TEXT, when defined, takes precedence.
    :type text: str
    :param num_option: Transformations on numbers (none | group | delete)
    :type num_option: str
    :param usr_option: Transformations on users (none | group | delete)
    :type usr_option: str
    :param url_option: Transformations on urls (none | group | delete)
    :type url_option: str
    :param emo_option: Transformations on emojis and emoticons (none | group | delete)
    :type emo_option: str
    :param hashtag_option: Transformations on hashtag (none | group | delete)
    :type hashtag_option: str
    :param ent_option: Transformations on entities (none | group | delete)
    :type ent_option: str
    :param lc: Lower case
    :type lc: bool
    :param del_dup: Remove duplicates e.g. hooola -> hola
    :type del_dup: bool
    :param del_punc: Remove punctuation symbols
    :type del_punc: bool
    :param del_diac: Remove diacritics
    :type del_diac: bool
    :param token_list: Tokens > 0 qgrams < 0 word-grams; entries that are not int are skip-grams (qsize, skip)
    :type token_list: list
    :param token_min_filter: Keep those tokens that appear more times than the parameter (used in weighting class)
    :type token_min_filter: int or float
    :param token_max_filter: Keep those tokens that appear less times than the parameter (used in weighting class)
    :type token_max_filter: int or float
    :param q_grams_words: Compute q-grams only on words
    :type q_grams_words: bool
    :param select_ent: Keep only the users (@...), hashtags (#...) and the words starting with an upper-case letter
    :type select_ent: bool
    :param select_suff: Keep only the tokens ending with a symbol or space
    :type select_suff: bool
    :param select_conn: Keep only the tokens having ~ in the middle, i.e., neither starting nor ending with ~
    :type select_conn: bool
    :param weighting: Weighting scheme (tfidf | tf | entropy)
    :type weighting: class or str
    :param max_dimension: Forwarded to the weighting class as `max_dimension`
    :type max_dimension: bool

    Usage:

    >>> from microtc.textmodel import TextModel
    >>> corpus = ['buenos dias', 'catedras conacyt', 'categorizacion de texto ingeotec']

    Using default parameters

    >>> textmodel = TextModel().fit(corpus)

    Represent a text whose words are in the corpus and one that does not

    >>> vector = textmodel['categorizacion ingoetec']
    >>> vector2 = textmodel['cat']

    Using a different token_list

    >>> textmodel = TextModel(token_list=[[2, 1], -1, 3, 4]).fit(corpus)
    >>> vector = textmodel['categorizacion ingoetec']
    >>> vector2 = textmodel['cat']

    Train a classifier

    >>> from sklearn.svm import LinearSVC
    >>> y = [1, 0, 0]
    >>> textmodel = TextModel().fit(corpus)
    >>> m = LinearSVC().fit(textmodel.transform(corpus), y)
    >>> m.predict(textmodel.transform(corpus))
    array([1, 0, 0])
    """

    def __init__(self, docs=None, text: str = 'text',
                 num_option: str = OPTION_GROUP,
                 usr_option: str = OPTION_GROUP,
                 url_option: str = OPTION_GROUP,
                 emo_option: str = OPTION_GROUP,
                 hashtag_option: str = OPTION_NONE,
                 ent_option: str = OPTION_NONE,
                 lc: bool = True, del_dup: bool = True,
                 del_punc: bool = False, del_diac: bool = True,
                 token_list: list = [-1],
                 token_min_filter: Union[int, float] = 0,
                 token_max_filter: Union[int, float] = 1,
                 select_ent: bool = False, select_suff: bool = False,
                 select_conn: bool = False,
                 weighting: str = 'tfidf',
                 q_grams_words: bool = False,
                 max_dimension: bool = False):
        self._text = os.getenv('TEXT', default=text)
        self.del_diac = del_diac
        self.num_option = num_option
        self.usr_option = usr_option
        self.url_option = url_option
        self.emo_option = emo_option
        self.ent_option = ent_option
        self.select_ent = select_ent
        self.select_suff = select_suff
        self.select_conn = select_conn
        self.hashtag_option = hashtag_option
        self.lc = lc
        self.del_dup = del_dup
        self.del_punc = del_punc
        # Copied so that the instance never shares (and cannot modify) the
        # default list of the signature or the list owned by the caller.
        self.token_list = list(token_list)
        self.token_min_filter = token_min_filter
        self.token_max_filter = token_max_filter
        # Short names are translated to the dotted path of the class; any
        # other value (a class or a dotted path) is kept untouched.
        self.weighting = WEIGHTING.get(weighting, weighting)
        self._q_grams_words = q_grams_words
        self._max_dimension = max_dimension
        if emo_option == OPTION_NONE:
            self.emo_map = None
        else:
            self.emo_map = EmoticonClassifier()
        if docs is not None and len(docs):
            self.fit(docs)

    @property
    def q_grams_words(self):
        """Whether the q-grams are computed word by word. It is False when
        the instance does not have the attribute."""
        return getattr(self, '_q_grams_words', False)

    @property
    def max_dimension(self):
        """Value forwarded to the weighting class. It is False when the
        instance does not have the attribute."""
        return getattr(self, '_max_dimension', False)

    # NOTE: q_grams, n_grams and skip_grams are computed from token_list on
    # first access and cached; assigning a new token_list afterwards is not
    # reflected by them.  A token_list setter clearing these caches was
    # drafted in a previous version but it is not enabled.

    @property
    def q_grams(self):
        """q-grams of characters

        >>> from microtc import TextModel
        >>> tm = TextModel(token_list=[-1, 3, (2, 1)])
        >>> tm.q_grams
        [3]
        """
        try:
            return self._q_grams
        except AttributeError:
            self._q_grams = [x for x in self.token_list
                             if isinstance(x, int) and x > 0]
            return self._q_grams

    @property
    def n_grams(self):
        """n-grams of words

        >>> from microtc import TextModel
        >>> tm = TextModel(token_list=[-1, 3, (2, 1)])
        >>> tm.n_grams
        [-1]
        """
        try:
            return self._n_grams
        except AttributeError:
            self._n_grams = [x for x in self.token_list
                             if isinstance(x, int) and x < 0]
            return self._n_grams

    @property
    def skip_grams(self):
        """skip-grams

        >>> from microtc import TextModel
        >>> tm = TextModel(token_list=[-1, 3, (2, 1)])
        >>> tm.skip_grams
        [(2, 1)]
        """
        try:
            return self._skip_grams
        except AttributeError:
            self._skip_grams = [x for x in self.token_list
                                if not isinstance(x, int)]
            return self._skip_grams

    def fit(self, X):
        """
        Train the model

        :param X: Corpus
        :type X: list
        :rtype: instance
        """
        tokens = [self.tokenize(d) for d in X]
        self.model = get_class(self.weighting)(tokens, X=X,
                                               token_min_filter=self.token_min_filter,
                                               token_max_filter=self.token_max_filter,
                                               max_dimension=self.max_dimension)
        # The cached inverse mapping belongs to the model just replaced.
        self.__dict__.pop('_id2token', None)
        return self

    def __getitem__(self, text):
        """Convert text into a vector

        :param text: Text to be transformed
        :type text: str
        :rtype: list
        """
        return self.model[self.tokenize(text)]

    @classmethod
    def params(cls):
        """
        Parameters

        >>> from microtc.textmodel import TextModel
        >>> TextModel.params()
        odict_keys(['docs', 'text', 'num_option', 'usr_option', 'url_option', 'emo_option', 'hashtag_option', 'ent_option', 'lc', 'del_dup', 'del_punc', 'del_diac', 'token_list', 'token_min_filter', 'token_max_filter', 'select_ent', 'select_suff', 'select_conn', 'weighting', 'q_grams_words', 'max_dimension'])
        """
        import inspect
        return inspect.signature(cls).parameters.keys()

    def transform(self, texts):
        """Convert texts into vectors

        :param texts: List of text to be transformed
        :type texts: list
        :rtype: list

        Example:

        >>> from microtc.textmodel import TextModel
        >>> corpus = ['buenos dias catedras', 'catedras conacyt']
        >>> textmodel = TextModel().fit(corpus)
        >>> X = textmodel.transform(corpus)
        """
        return self.tonp([self[x] for x in texts])

    def vectorize(self, text):
        """Not implemented.

        :raises NotImplementedError: always (it is a subclass of RuntimeError)
        """
        raise NotImplementedError('Not implemented')

    def tokenize(self, text):
        """Transform text to tokens. The procedure is:

        - :py:func:`microtc.textmodel.TextModel.text_transformations`.
        - :py:func:`microtc.textmodel.TextModel.compute_tokens`.
        - :py:func:`microtc.textmodel.TextModel.select_tokens`.

        A dict is replaced by the value stored under the text key; the
        tokens of the elements of a list or tuple are concatenated.

        :param text: Text
        :type text: str or list
        :rtype: list

        Example:

        >>> from microtc.textmodel import TextModel
        >>> tm = TextModel()
        >>> tm.tokenize("buenos dias")
        ['buenos', 'dias']
        >>> tm.tokenize(["buenos", "dias", "tenga usted"])
        ['buenos', 'dias', 'tenga', 'usted']
        """
        if isinstance(text, dict):
            text = self.get_text(text)
        if isinstance(text, (list, tuple)):
            tokens = []
            for _text in text:
                tokens.extend(self._tokenize(_text))
            return tokens
        return self._tokenize(text)

    def get_text(self, text):
        """Return self._text key from text

        :param text: Text
        :type text: dict
        """
        return text[self._text]

    @property
    def disable_text_transformations(self):
        """When True, :py:func:`microtc.textmodel.TextModel.text_transformations`
        returns the text as received. It is False unless it has been set."""
        return getattr(self, '_disable_text_transformations', False)

    @disable_text_transformations.setter
    def disable_text_transformations(self, v):
        self._disable_text_transformations = v

    @staticmethod
    def _replace_by_option(option, pattern, group_token, text):
        """Remove (delete) or replace by `group_token` (group) the matches of
        `pattern`; any other option leaves the text untouched."""
        if option == OPTION_DELETE:
            return re.sub(pattern, "", text)
        if option == OPTION_GROUP:
            return re.sub(pattern, group_token, text)
        return text

    def text_transformations(self, text):
        """
        Text transformations. It starts by analyzing emojis, hashtags, entities,
        lower case, numbers, URL, and users. After these transformations are applied
        to the text, it calls :py:func:`microtc.textmodel.norm_chars`.

        None is treated as the empty string and a dict is replaced by the
        value stored under the text key.

        :param text:
        :type text: str

        :rtype: str

        Example:

        >>> from microtc.textmodel import TextModel
        >>> tm = TextModel(del_dup=False)
        >>> tm.text_transformations("Life is good at México @mgraffg.")
        '~life~is~good~at~mexico~_usr~'
        """
        if text is None:
            text = ''
        if isinstance(text, dict):
            text = self.get_text(text)
        if self.disable_text_transformations:
            return text
        if self.emo_map:
            text = self.emo_map.replace(text, option=self.emo_option)
        if self.select_ent:
            text = " ".join(re.findall(r"(@\S+|#\S+|[A-Z]\S+)", text))
        # Hashtags and entities are processed before lower-casing because
        # the entity pattern depends on the upper-case letters.
        text = self._replace_by_option(self.hashtag_option, r"#\S+", "_htag", text)
        text = self._replace_by_option(self.ent_option, r"[A-Z][a-z]+", "_ent", text)
        if self.lc:
            text = text.lower()
        text = self._replace_by_option(self.num_option,
                                       r"\d\d*\.?\d*|\d*\.\d\d*", "_num", text)
        text = self._replace_by_option(self.url_option, r"https?://\S+", "_url", text)
        text = self._replace_by_option(self.usr_option, r"@\S+", "_usr", text)
        return norm_chars(text, del_diac=self.del_diac,
                          del_dup=self.del_dup, del_punc=self.del_punc)

    def get_word_list(self, *args, **kwargs):
        """Call :py:func:`microtc.textmodel.get_word_list`."""
        return get_word_list(*args, **kwargs)

    def compute_n_grams(self, textlist):
        """Word n-grams of `textlist` for each size in `n_grams`."""
        output = []
        for q in self.n_grams:
            # The sizes of the word n-grams are stored as negative numbers.
            expand_qgrams_word_list(textlist, abs(q), output)
        return output

    def compute_skip_grams(self, textlist):
        """Skip-grams of `textlist` for each (qsize, skip) in `skip_grams`."""
        output = []
        for q in self.skip_grams:
            expand_skipgrams_word_list(textlist, q, output)
        return output

    def compute_q_grams(self, text):
        """Character q-grams of `text` for each size in `q_grams`."""
        output = []
        for q in self.q_grams:
            expand_qgrams(text, q, output)
        return output

    def compute_q_grams_words(self, textlist):
        """
        Character q-grams computed independently on each word; every word is
        surrounded by ~ before it is expanded.

        >>> from microtc import TextModel
        >>> tm = TextModel(token_list=[3])
        >>> tm.compute_q_grams_words(['abc', 'def'])
        ['q:~ab', 'q:abc', 'q:bc~', 'q:~de', 'q:def', 'q:ef~']
        """
        output = []
        words = ['~' + x + '~' for x in textlist]
        for qsize in self.q_grams:
            for word in words:
                # Words shorter than qsize do not produce any q-gram.
                expand_qgrams(word, qsize, output)
        return output

    def compute_tokens(self, text):
        """
        Compute tokens from a text using q-grams of characters and words, and skip-grams.

        The output has three lists: word n-grams, skip-grams, and character
        q-grams, in that order.

        :param text: Text transformed by :py:func:`microtc.textmodel.TextModel.text_transformations`.
        :type text: str

        :rtype: list

        Example:

        >>> from microtc.textmodel import TextModel
        >>> tm = TextModel(token_list=[-2, -1])
        >>> tm.compute_tokens("~Good morning~")
        [['Good~morning', 'Good', 'morning'], [], []]
        >>> tm = TextModel(token_list=[3])
        >>> tm.compute_tokens('abc def')
        [[], [], ['q:abc', 'q:bc ', 'q:c d', 'q: de', 'q:def']]
        >>> tm = TextModel(token_list=[(2, 1)])
        >>> tm.compute_tokens('~abc x de~')
        [[], ['abc~de'], []]
        >>> tm = TextModel(token_list=[3], q_grams_words=True)
        >>> tm.compute_tokens('~abc def~')
        [[], [], ['q:~ab', 'q:abc', 'q:bc~', 'q:~de', 'q:def', 'q:ef~']]
        """
        textlist = self.get_word_list(text)
        if self.q_grams_words:
            q_grams = self.compute_q_grams_words(textlist)
        else:
            q_grams = self.compute_q_grams(text)
        return [self.compute_n_grams(textlist),
                self.compute_skip_grams(textlist),
                q_grams]

    def select_tokens(self, L):
        """
        Filter tokens using suffix or connections

        :param L: list of tokens
        :type L: list

        :rtype: list
        """
        if self.select_suff:
            L = [tok for tok in L if tok[-1] in SKIP_SYMBOLS_AND_SPACES]
        if self.select_conn:
            L = [tok for tok in L
                 if '~' in tok and tok[0] != '~' and tok[-1] != '~']
        return L

    def _tokenize(self, text):
        """Tokens of a single text; ['~'] when no token is left."""
        text = self.text_transformations(text)
        tokens = []
        for group in self.compute_tokens(text):
            tokens.extend(group)
        tokens = self.select_tokens(tokens)
        if not tokens:
            tokens = ['~']
        return tokens

    @property
    def num_terms(self):
        """Dimension which is the number of terms of the corpus

        >>> from microtc.textmodel import TextModel
        >>> corpus = ['buenos dias', 'catedras conacyt', 'categorizacion de texto ingeotec']
        >>> textmodel = TextModel().fit(corpus)
        >>> _ = textmodel.transform(corpus)
        >>> textmodel.num_terms
        8

        :rtype: int
        """
        return self.model.num_terms

    @property
    def token_weight(self):
        """
        Weight associated to each token id

        >>> from microtc.textmodel import TextModel
        >>> corpus = ['buenos dias', 'catedras conacyt', 'categorizacion de texto ingeotec']
        >>> textmodel = TextModel().fit(corpus)
        >>> _ = textmodel.transform(corpus)
        >>> textmodel.token_weight[5]
        1.584962500721156
        """
        return self.model.wordWeight

    @property
    def id2token(self):
        """
        Token identifier to token. The mapping is built from `token2id` on
        first access and cached until the model is fitted again.

        >>> from microtc.textmodel import TextModel
        >>> corpus = ['buenos dias', 'catedras conacyt', 'categorizacion de texto ingeotec']
        >>> textmodel = TextModel().fit(corpus)
        >>> _ = textmodel.transform(corpus)
        >>> textmodel.id2token[5]
        'de'
        """
        try:
            return self._id2token
        except AttributeError:
            self._id2token = {v: k for k, v in self.token2id.items()}
            return self._id2token

    @property
    def token2id(self):
        """
        Token to token identifier

        >>> from microtc.textmodel import TextModel
        >>> corpus = ['buenos dias', 'catedras conacyt', 'categorizacion de texto ingeotec']
        >>> textmodel = TextModel().fit(corpus)
        >>> _ = textmodel.transform(corpus)
        >>> textmodel.token2id['de']
        5
        """
        return self.model.word2id