Weird first word read in Python

  Kiến thức lập trình

I’m reading the first Harry Potter book as a UTF-8 file in Python (I’ve tried packages io and codecs), and the first word that is read, which is “harry” (lowercase because I first word tokenize the entire corpus with nltk), is read as '\ufeffharry'. I’m guessing this has to do with the encoding and perhaps because it’s the first word of the sentence.

I’ll include the code I’m using. The function on which focus should be put is process_file, and perhaps _process_tokens.

What can I do so that 'harry' is read instead?

class NGramTrainer(object):
    """
    Accumulates n-gram counts (1-grams through n-grams) from a text corpus
    and can write the resulting model statistics to a file.
    """

    # Padding symbol prepended to every sentence so the first real word
    # has a full (n-1)-token history.
    START_SYMBOL = '</s>'

    def __init__(self, n):
        """
        NGramTrainer constructor

        :param n: Size of grams
        """
        # Size of grams
        self.n = n

        # Sentence starter that adds 'padding' to each sentence
        self.sentence_starter = [self.START_SYMBOL] * (n - 1)

        # Sentence tokens from corpus
        self.tokenized_sentences = None

        # Collection of n-gram counts (pos i corresponds to (i+1)-ngrams)
        self.ngrams = [defaultdict(int) for _ in range(n)]

        # For each word, its ID.
        self.word2id = {}

        # For each ID, its word
        self.id2word = {}

        # Vocabulary size: number of unique words.
        self.num_unique_words = 0

        # Total number of words from corpus
        self.num_total_words = 0

    def process_file(self, file):
        """
        Reads the corpus file, tokenizes it into sentences and words,
        and updates the n-gram counts.

        :param file: Path to the UTF-8 corpus file to process.
        """

        # 'utf-8-sig' transparently strips a leading byte-order mark
        # (U+FEFF) if present, so the first token is read as 'harry'
        # rather than '\ufeffharry'.  The previous
        # .encode('utf-8').decode() round-trip was a no-op and is gone.
        with io.open(file, mode='r', encoding='utf-8-sig') as f:
            text = f.read().lower()
        try:
            self.tokenized_sentences = nltk.sent_tokenize(text)
        except LookupError:
            # Required NLTK data not present yet: fetch it once and retry.
            nltk.download('punkt')
            nltk.download('wordnet')
            nltk.download('omw-1.4')
            self.tokenized_sentences = nltk.sent_tokenize(text)

        # Process each sentence.
        for i in tqdm(range(len(self.tokenized_sentences)),
                      desc="Processing corpus", colour='green'):
            tokenized_words = nltk.word_tokenize(self.tokenized_sentences[i])
            tokenized_words = self.sentence_starter + tokenized_words
            self._process_tokens(tokenized_words)
        print(self.ngrams[0])

    def _process_tokens(self, tokens):
        """
        Processes the list of tokens, and
        adjusts the ngram counts.

        :param tokens: The list of tokens to be processed.
        """

        # Update maps word2id & id2word.
        for token in tokens:
            if token not in self.word2id:
                self.word2id[token] = self.num_unique_words
                self.id2word[self.num_unique_words] = token
                self.num_unique_words += 1
        self.num_total_words += len(tokens)

        # We will count one start symbol per sentence
        self.ngrams[0][(self.START_SYMBOL,)] += 1

        # Iterate over all possible n-grams
        for i in range(self.n - 1, len(tokens)):
            # Obtain the n-gram stretching from pos i-n+1 to i --> interval [i-n+1, i+1)
            ngram = tuple(tokens[i - self.n + 1:i + 1])

            # Update the count for each l-gram, l = 1, ..., n.
            # The (k+1)-gram is the last k+1 tokens of the full n-gram.
            # Slicing to the end of the tuple replaces the old upper bound
            # i+1, which only worked because tuple slices clamp (i+1 >= n).
            for k in range(self.n):  # k = 0, ..., n-1
                self.ngrams[k][ngram[self.n - 1 - k:]] += 1

    def _get_stats(self):
        """
        Returns the model statistics as a list of text rows:
        a header row "num_unique num_total", then for each k one row with
        the number of k-grams followed by one row per k-gram
        ("ids [word] count log_prob"), and a final "-1" EOF marker.
        """

        print("Harry", self.ngrams[0][('harry',)])

        # Initial row
        rows = [str(self.num_unique_words) + " " + str(self.num_total_words)]

        # For each k-grams, print their stats
        for k in range(self.n):

            # Get the k-grams
            kgrams = self.ngrams[k]

            # Record how many lines are gonna follow
            rows.append(str(len(kgrams)))

            # For each kgram (tuple) in the kgrams dict
            for kgram in kgrams:

                # Transform the words into string ids
                ids = ' '.join(str(self.word2id[word]) for word in kgram)

                # Compute the (log) probability
                # P(w_i | w_{i-n+1}, ..., w_{i-1}) =
                # c(w_{i-n+1}, ..., w_{i-1}, w_i) / c(w_{i-n+1}, ..., w_{i-1})

                # Get the number of occurrences of this kgram
                kgram_count = kgrams[kgram]
                if k == 0:  # Uni-gram --> Use log_prob for unigram_count
                    log_prob = math.log(kgram_count) - math.log(self.num_total_words)
                    ids += ' ' + kgram[0]  # Append word.
                else:  # Dealing with 2, 3, ... -grams.
                    # If the previous kgram doesn't exist (start symbols)
                    if kgram[:-1] not in self.ngrams[k - 1]:
                        log_prob = -float('inf')  # So that e^(-inf) = 0
                    else:
                        prev_kgram_count = self.ngrams[k - 1][kgram[:-1]]
                        log_prob = math.log(kgram_count) - math.log(prev_kgram_count)
                log_prob = format(log_prob, '.15f')
                rows.append(ids + " " + str(kgram_count) + " " + str(log_prob))
        rows.append(str(-1))  # EOF
        return rows

    def save_model(self, file):
        """
        Save model stats in the provided file, one row per line.

        :param file: Path of the output file.
        """
        try:
            print("Saving model...")
            with io.open(file, mode='w', encoding='utf-8') as f:
                for row in self._get_stats():
                    # Was row + 'n' (missing backslash): every row ended up
                    # on a single line separated by literal 'n' characters.
                    f.write(row + '\n')
            print("Model saved!")
        except FileNotFoundError:
            print("The file", file, " was not found.")
        except IOError:
            print("An IOError occurred while saving the model.")

1

LEAVE A COMMENT