3
'^                 @   s0   d dl Z d dlZddlmZ G dd deZdS )    N   )ProbingStatec               @   sn   e Zd ZdZdddZdd Zedd Zd	d
 Zedd Z	dd Z
edd Zedd Zedd ZdS )CharSetProbergffffff?Nc             C   s   d | _ || _tjt| _d S )N)_statelang_filterlogging	getLogger__name__logger)selfr    r   8/tmp/pip-build-8app2_gc/chardet/chardet/charsetprober.py__init__'   s    zCharSetProber.__init__c             C   s   t j| _d S )N)r   	DETECTINGr   )r   r   r   r   reset,   s    zCharSetProber.resetc             C   s   d S )Nr   )r   r   r   r   charset_name/   s    zCharSetProber.charset_namec             C   s   d S )Nr   )r   bufr   r   r   feed3   s    zCharSetProber.feedc             C   s   | j S )N)r   )r   r   r   r   state6   s    zCharSetProber.statec             C   s   dS )Ng        r   )r   r   r   r   get_confidence:   s    zCharSetProber.get_confidencec             C   s   t jdd| } | S )Ns   ([ -])+    )resub)r   r   r   r   filter_high_byte_only=   s    z#CharSetProber.filter_high_byte_onlyc             C   sb   t  }tjd| }xJ|D ]B}|j|dd  |dd }|j  rP|dk rPd}|j| qW |S )u9  
        We define three types of bytes:
        alphabet: english alphabets [a-zA-Z]
        international: international characters [-ÿ]
        marker: everything else [^a-zA-Z-ÿ]

        The input buffer can be thought to contain a series of words delimited
        by markers. This function works to filter all words that contain at
        least one international character. All contiguous sequences of markers
        are replaced by a single space ascii character.

        This filter applies to all scripts which do not use English characters.
        s%   [a-zA-Z]*[-]+[a-zA-Z]*[^a-zA-Z-]?Nr      r   r   )	bytearrayr   findallextendisalpha)r   filteredwordsword	last_charr   r   r   filter_international_wordsB   s    
z(CharSetProber.filter_international_wordsc             C   s   t  }d}d}xtt| D ]r}| ||d  }|dkr>d}n|dkrJd}|dk r|j  r||kr| r|j| ||  |jd |d }qW |s|j| |d	  |S )
a  
        Returns a copy of ``buf`` that retains only the sequences of English
        alphabet and high byte characters that are not between <> characters.
        Also retains English alphabet and high byte characters immediately
        before occurrences of >.

        This filter can be applied to all scripts which contain both English
        characters and extended ASCII characters, but is currently only used by
        ``Latin1Prober``.
        Fr   r      >   <Tr   r   N)r   rangelenr   r   )r   r    in_tagprevcurrbuf_charr   r   r   filter_with_english_lettersg   s"    
z)CharSetProber.filter_with_english_letters)N)r	   
__module____qualname__SHORTCUT_THRESHOLDr   r   propertyr   r   r   r   staticmethodr   r$   r-   r   r   r   r   r   #   s   
%r   )r   r   enumsr   objectr   r   r   r   r   <module>   s   