用Python编写干净可测试高质量的代码-技术开发专区

用Python编写干净可测试高质量的代码

作者：IBM 编辑：董建伟 2010-12-24 10:18 来源：IBM

　　highlight spy

#/usr/bin/python
# -*- coding: utf-8 -*-

"""
:mod:`highlight` -- Highlight Methods
===================================

.. module:: highlight
   :platform: Unix, Windows
   :synopsis: highlight document snippets that match a query.
.. moduleauthor:: Noah Gift

Requirements::
    1.  You will need to install the ntlk library to run this code.
        http://www.nltk.org/download
    2.  You will need to download the data for the ntlk:
        See http://www.nltk.org/data::

        import nltk
        nltk.download()

"""

import re
import logging

import nltk

#Globals
logging.basicConfig()
LOG = logging.getLogger("highlight")
LOG.setLevel(logging.INFO)

class HighlightDocumentOperations(object):

    """Highlight Operations for a Document"""

    def __init__(self, document=None, query=None):
        """
        Kwargs:
            document (str):
            query (str):

        """
        self._document = document
        self._query = query

    @staticmethod
    def _custom_highlight_tag(phrase,
                              start="<strong>",
                              end="</strong>"):

        """Injects an open and close highlight tag after a word

        Args:
            phrase (str) - A word or phrase.
        Kwargs:
        start (str) - An opening tag.  Defaults to <strong>
        end (str) - A closing tag.  Defaults to </strong>
        Returns:
            (str) word or phrase with custom opening and closing tags

        >>> h = HighlightDocumentOperations()
        >>> h._custom_highlight_tag("foo")
        'foo'
        >>>

        """
        tagged_phrase = "{0}{1}{2}".format(start, phrase, end)
        return tagged_phrase

    def _doc_to_sentences(self):
        """Takes a string document and converts it into a list of sentences

        Unfortunately, this approach might be a tad naive for production
        because some segments that are split on a period are really an
        abbreviation, and to make things even more complicated, an
        abbreviation can also be the end of a sentence::
            http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html

        Returns:
            (generator) A generator object of a tokenized sentence tuple,
            with the list position of sentence as the first portion of
            the tuple, such as:  (0, "This was the first sentence")

        """

        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        sentences = tokenizer.tokenize(self._document)
        for sentence in enumerate(sentences):
            yield sentence

    @staticmethod
    def _score_sentences(sentence, querydict):
        """Creates a scoring system for each sentence by substitution analysis

        Tokenizes each sentence, counts characters
        in sentence, and pass it back as nested tuple

        Returns:
            (tuple) - (score (int), (count (int), position (int),
                   raw sentence (str))

        """

        position, sentence = sentence
        count = len(sentence)
        regex = re.compile('|'.join(map(re.escape, querydict)))
        score = len(re.findall(regex, sentence))
        processed_score = (score, (count, position, sentence))
        return processed_score

    def _querystring_to_dict(self, split_token="+"):
        """Converts query parameters into a dictionary

        Returns:
            (dict)- dparams, a dictionary of query parameters

        """

        params = self._query.split(split_token)
        dparams = dict([(key, self._custom_highlight_tag(key)) for\
                    key in params])
        return dparams

    @staticmethod
    def _word_frequency_sort(sentences):
        """Sorts sentences by score frequency, yields sorted result

        This will yield the highest score count items first.

        Args:
            sentences (list) - a nested tuple inside of list
            [(0, (90, 3, "The crust/dough was just way too effin' dry for me.
            Yes, I know what 'cornmeal' is, thanks."))]

        """

        sentences.sort()
        while sentences:
            yield sentences.pop()

    def _create_snippit(self, sentences, max_characters=175):
        """Creates a snippet from a sentence while keeping it under max_chars

        Returns a sorted list with max characters.  The sort is an attempt
        to rebuild the original document structure as close as possible,
        with the new sorting by scoring and the limitation of max_chars.

        Args:
            sentences (generator) - sorted object to turn into a snippit
            max_characters (int) - optional max characters of snippit

        Returns:
            snippit (list) - returns a sorted list with a nested tuple that
            has the first index holding the original position of the list::

            [(0, (90, 3, "The crust/dough was just way too effin' dry for me.
            Yes, I know what 'cornmeal' is, thanks."))]

        """

        snippit = []
        total = 0
        for sentence in self._word_frequency_sort(sentences):
            LOG.debug("Creating snippit", sentence)
            score, (count, position, raw_sentence) = sentence
            total += count
            if total < max_characters:
                #position now gets converted to index 0 for sorting later
                snippit.append(((position), score, count, raw_sentence))

        #try to reassemble document by original order by doing a simple sort
        snippit.sort()
        return snippit

    @staticmethod
    def _multiple_string_replace(string_to_replace, dict_patterns):
        """Performs a multiple replace in a string with dict pattern.

        Borrowed from Python Cookbook.

        Args:
            string_to_replace (str) - String to be multi-replaced
            dict_patterns (dict) - A dict full of patterns

        Returns:
            (str) - Multiple replaced string.

        """

        regex = re.compile('|'.join(map(re.escape, dict_patterns)))
        def one_xlat(match):
            """Closure that is called repeatedly during multi-substitution.

            Args:
                match (SRE_Match object)
            Returns:
                partial string substitution (str)

            """

            return dict_patterns[match.group(0)]

        return regex.sub(one_xlat, string_to_replace)

    def _reconstruct_document_string(self, snippit, querydict):
        """Reconstructs string snippit, build tags, and return string

        A helper function for highlight_doc.

        Args:
            string_to_replace (list) - A list of nested tuples, containing
            this pattern::

            [(0, (90, 3, "The crust/dough was just way too effin' dry for me.
            Yes, I know what 'cornmeal' is, thanks."))]

            dict_patterns (dict) - A dict full of patterns

        Returns:
            (str) The most relevant snippet with the query terms highlighted.

        """

        snip = []
        for entry in snippit:
            score = entry[1]
            sent = entry[3]
            #if we have matches, now do the multi-replace
            if score:
                sent = self._multiple_string_replace(sent,
                                                    querydict)
            snip.append(sent)
        highlighted_snip = " ".join(snip)

        return highlighted_snip

    def highlight_doc(self):
        """Finds the most relevant snippit with the query terms highlighted

        Returns:
            (str) The most relevant snippet with the query terms highlighted.

        """

        #tokenize to sentences, and convert query to a dict
        sentences = self._doc_to_sentences()
        querydict = self._querystring_to_dict()

        #process and score sentences
         scored_sentences = []
        for sentence in sentences:
            scored = self._score_sentences(sentence, querydict)
            scored_sentences.append(scored)

        #fit into max characters, and sort by original position
        snippit = self._create_snippit(scored_sentences)
        #assemble back into string
        highlighted_snip = self._reconstruct_document_string(snippit,
                                                             querydict)

        return highlighted_snip

第1页：简介第2页：pygenie的圈复杂度输出第3页：highlight spy 第4页：test_highlight.py 第5页：test_functional_highlight.py 第6页：Pylint

关注我们

用Python编写干净 可测试 高质量的代码

用Python编写干净可测试高质量的代码