Source code for ztlearn.utils.text_utils

# -*- coding: utf-8 -*-

import re

import numpy as np

#-----------------------------------------------------------------------------#
#                           TEXT UTILITY FUNCTIONS                            #
#-----------------------------------------------------------------------------#

def gen_char_sequence_xtym(text, maxlen, step, tensor_dtype = np.int_):
    """ one-hot encode maxlen-character windows (x) against the single next character (y) """
    chars     = sorted(list(set(text)))
    len_chars = len(chars)
    len_text  = len(text)

    char_to_indices = {c: i for i, c in enumerate(chars)}
    # indices_to_char = {i: c for i, c in enumerate(chars)}

    sentences, next_chars = [], []
    for i in range(0, len_text - maxlen, step):
        sentences.append(text[i: i + maxlen])
        next_chars.append(text[i + maxlen])

    len_sentences = len(sentences)

    x = np.zeros((len_sentences, maxlen, len_chars), dtype = tensor_dtype)
    y = np.zeros((len_sentences, len_chars), dtype = tensor_dtype)

    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            x[i, t, char_to_indices[char]] = 1
        y[i, char_to_indices[next_chars[i]]] = 1

    return x, y, len_chars
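# Usage sketch (illustrative, not part of the original module): the corpus
# string below is made up. Each 10-character window in x is paired in y with
# a one-hot encoding of the character that immediately follows it.
def _demo_gen_char_sequence_xtym():
    demo_text = 'the quick brown fox jumps over the lazy dog'
    x, y, vocab_size = gen_char_sequence_xtym(demo_text, maxlen = 10, step = 3)
    print(x.shape, y.shape, vocab_size) # (num_windows, 10, vocab), (num_windows, vocab)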
def gen_char_sequence_xtyt(text, maxlen, step, tensor_dtype = np.int_):
    """ one-hot encode maxlen-character windows (x) against the same windows shifted one character ahead (y) """
    chars     = sorted(list(set(text)))
    len_chars = len(chars)
    len_text  = len(text)

    char_to_indices = {c: i for i, c in enumerate(chars)}
    # indices_to_char = {i: c for i, c in enumerate(chars)}

    sentences, next_chars = [], []
    for i in range(0, len_text - maxlen, step): # upper bound keeps the shifted target window fully in range
        sentences.append(text[i : i + maxlen])
        next_chars.append(text[i+1 : i+1 + maxlen])

    len_sentences = len(sentences)

    x = np.zeros((len_sentences, maxlen, len_chars), dtype = tensor_dtype)
    y = np.zeros((len_sentences, maxlen, len_chars), dtype = tensor_dtype)

    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            x[i, t, char_to_indices[char]] = 1

    for i, sentence in enumerate(next_chars):
        for t, char in enumerate(sentence):
            y[i, t, char_to_indices[char]] = 1

    return x, y, len_chars
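# Usage sketch (illustrative, not part of the original module): unlike the
# xtym variant, y here is a full shifted sequence, which suits
# sequence-to-sequence character models. The corpus string is a made-up example.
def _demo_gen_char_sequence_xtyt():
    demo_text = 'the quick brown fox jumps over the lazy dog'
    x, y, vocab_size = gen_char_sequence_xtyt(demo_text, maxlen = 10, step = 3)
    print(x.shape, y.shape, vocab_size) # both (num_windows, 10, vocab)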
def pad_sequence(sequence, maxlen = None, dtype = 'int32', padding = 'pre', truncating = 'pre', value = 0.0):
    """ pad or truncate a sequence to exactly maxlen entries (maxlen is typically the longest sequence length) """
    if maxlen is None:
        return sequence

    np_sequence   = np.asarray(sequence, dtype = dtype)
    sequence_size = np_sequence.size

    if maxlen > sequence_size: # pad with value up to maxlen
        pad_size = maxlen - sequence_size
        if padding == 'pre':
            front_pad, back_pad = pad_size, 0
        else: # 'post'
            front_pad, back_pad = 0, pad_size

        padded = np.pad(np_sequence, (front_pad, back_pad), 'constant', constant_values = (value, value))

        return padded

    if sequence_size > maxlen: # truncate down to maxlen
        trunc_size = sequence_size - maxlen
        if truncating == 'pre': # drop entries from the front
            truncated = np_sequence[trunc_size:]
        else: # 'post': drop entries from the back
            truncated = np_sequence[:-trunc_size]

        return truncated

    return np_sequence # sequence_size == maxlen
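# Usage sketch (illustrative, not part of the original module): 'pre' pads or
# truncates at the front of the sequence, 'post' at the back.
def _demo_pad_sequence():
    print(pad_sequence([4, 7, 1], maxlen = 5))                     # [0 0 4 7 1]
    print(pad_sequence([4, 7, 1], maxlen = 5, padding = 'post'))   # [4 7 1 0 0]
    print(pad_sequence([4, 7, 1], maxlen = 2, truncating = 'pre')) # [7 1]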
def get_sentence_tokens(text_list, maxlen = None, dtype = 'int32'):
    """ split text into sentences and map each word to its vocabulary index, padding every sentence to maxlen """
    unique_words = list(set(text_list.split()))
    sentences    = re.split(r'(?<=[.?!])\s+', text_list)

    if maxlen is None:
        maxlen = max(longest_sentence(sentences))

    sentence_index_list = []
    for s in sentences:
        sentence_index = list(map(unique_words.index, s.split()))
        padded_index   = pad_sequence(sentence_index, maxlen, dtype = dtype)
        sentence_index_list.append(padded_index)

    return np.array(sentence_index_list), len(unique_words), maxlen
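# Usage sketch (illustrative, not part of the original module): the two-sentence
# string below is made up. The shorter sentence is pre-padded so both rows share
# the length of the longest sentence.
def _demo_get_sentence_tokens():
    demo_text = 'cats sleep all day. dogs bark.'
    tokens, vocab_size, maxlen = get_sentence_tokens(demo_text)
    print(tokens.shape, vocab_size, maxlen) # (2, 4) 6 4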
def longest_sentence(sentences):
    """ yield the word count of every sentence in a (possibly nested) list of sentences; max() over the generator gives the longest """
    if isinstance(sentences, list):
        for sentence in sentences:
            yield from longest_sentence(sentence)
    else:
        yield len(sentences.split())
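# Usage sketch (illustrative, not part of the original module): the generator
# yields one word count per sentence, so max() recovers the longest length even
# for nested lists of sentences.
def _demo_longest_sentence():
    sentences = ['a tiny one.', ['a slightly longer sentence here.']]
    print(list(longest_sentence(sentences))) # [3, 5]
    print(max(longest_sentence(sentences)))  # 5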