Viewing File: /opt/hc_python/lib/python3.12/site-packages/charset_normalizer/__pycache__/md.cpython-312.pyc

�

`��gDN���ddlmZddlmZddlmZddlmZmZm	Z	ddl
mZmZm
Z
mZmZmZmZmZmZmZmZmZmZmZmZmZmZGd�d�ZGd	�d
e�ZGd�de�ZGd
�de�ZGd�de�Z Gd�de�Z!Gd�de�Z"Gd�de�Z#Gd�de�Z$Gd�de�Z%ed��						d!d��Z&ed��	d"							d#d��Z'y )$�)�annotations)�	lru_cache)�	getLogger�)�COMMON_SAFE_ASCII_CHARACTERS�TRACE�UNICODE_SECONDARY_RANGE_KEYWORD)�is_accentuated�	is_arabic�is_arabic_isolated_form�is_case_variable�is_cjk�is_emoticon�	is_hangul�is_hiragana�is_katakana�is_latin�is_punctuation�is_separator�	is_symbol�is_thai�is_unprintable�
remove_accent�
unicode_rangec�:�eZdZdZdd�Zdd�Zd	d�Zed
d��Zy)�MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    c��t�)z@
        Determine if given character should be fed in.
        ��NotImplementedError��self�	characters  �F/opt/hc_python/lib64/python3.12/site-packages/charset_normalizer/md.py�eligiblezMessDetectorPlugin.eligible&�
��"�!�c��t�)z�
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        rr s  r#�feedzMessDetectorPlugin.feed,s
��
"�!r&c��t�)zB
        Permit to reset the plugin to the initial state.
        r�r!s r#�resetzMessDetectorPlugin.reset3r%r&c��t�)z�
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        rr*s r#�ratiozMessDetectorPlugin.ratio9s
��"�!r&N�r"�str�return�bool�r"r/r0�None�r0r3�r0�float)	�__name__�
__module__�__qualname__�__doc__r$r(r+�propertyr-�r&r#rr s*���
"�"�"��"��"r&rc�>�eZdZdd�Zdd�Zd	d�Zdd�Zed
d��Zy)� TooManySymbolOrPunctuationPluginc�J�d|_d|_d|_d|_d|_y)NrF)�_punctuation_count�
_symbol_count�_character_count�_last_printable_char�_frenzy_symbol_in_wordr*s r#�__init__z)TooManySymbolOrPunctuationPlugin.__init__Cs*��'(���"#���%&���04��!�,1��#r&c�"�|j�S�N��isprintabler s  r#r$z)TooManySymbolOrPunctuationPlugin.eligibleK����$�$�&�&r&c�8�|xjdz
c_||jk7ro|tvrgt|�r|xjdz
c_||_y|j�dur-t
|�r"t|�dur|xjdz
c_||_y)NrF�)	rBrCrrr@�isdigitrrrAr s  r#r(z%TooManySymbolOrPunctuationPlugin.feedNs�������"��
��2�2�2��!=�=��i�(��'�'�1�,�'�%.��!�
�!�!�#�u�,��i�(��	�*�e�3��"�"�a�'�"�$-��!r&c�.�d|_d|_d|_y�Nr)r@rBrAr*s r#r+z&TooManySymbolOrPunctuationPlugin.reset`s��"#��� !�����r&c��|jdk(ry|j|jz|jz}|dk\r|SdS)Nr��333333�?)rBr@rA)r!�ratio_of_punctuations  r#r-z&TooManySymbolOrPunctuationPlugin.ratioesO��� � �A�%��
�#�#�d�&8�&8�8��!�!�'"��(<�s�'B�#�K��Kr&Nr4r.r2r5�	r7r8r9rEr$r(r+r;r-r<r&r#r>r>Bs,��2�'�.�$�
�L��Lr&r>c�>�eZdZdd�Zdd�Zd	d�Zdd�Zed
d��Zy)�TooManyAccentuatedPluginc� �d|_d|_yrO�rB�_accentuated_countr*s r#rEz!TooManyAccentuatedPlugin.__init__rs��%&���'(��r&c�"�|j�SrG)�isalphar s  r#r$z!TooManyAccentuatedPlugin.eligiblevs��� � �"�"r&c�p�|xjdz
c_t|�r|xjdz
c_yy�Nr)rBr
rYr s  r#r(zTooManyAccentuatedPlugin.feedys1������"���)�$��#�#�q�(�#�%r&c� �d|_d|_yrOrXr*s r#r+zTooManyAccentuatedPlugin.resets�� !���"#��r&c�f�|jdkry|j|jz}|dk\r|SdS)N�rQgffffff�?rX)r!�ratio_of_accentuations  r#r-zTooManyAccentuatedPlugin.ratio�s=��� � �1�$��'+�'>�'>��AV�AV�'V��(=��(E�$�N�3�Nr&Nr4r.r2r5rTr<r&r#rVrVqs,��)�#�)�$��O��Or&rVc�>�eZdZdd�Zdd�Zd	d�Zdd�Zed
d��Zy)�UnprintablePluginc� �d|_d|_yrO)�_unprintable_countrBr*s r#rEzUnprintablePlugin.__init__�s��'(���%&��r&c��y�NTr<r s  r#r$zUnprintablePlugin.eligible����r&c�n�t|�r|xjdz
c_|xjdz
c_yr])rrerBr s  r#r(zUnprintablePlugin.feed�s,���)�$��#�#�q�(�#�����"�r&c��d|_yrO)rer*s r#r+zUnprintablePlugin.reset�s
��"#��r&c�Z�|jdk(ry|jdz|jzS)NrrQr`)rBrer*s r#r-zUnprintablePlugin.ratio�s/��� � �A�%���'�'�!�+�t�/D�/D�D�Dr&Nr4r.r2r5rTr<r&r#rcrc�s,��'��#�
$��E��Er&rcc�>�eZdZdd�Zdd�Zd	d�Zdd�Zed
d��Zy)�SuspiciousDuplicateAccentPluginc�.�d|_d|_d|_yrO��_successive_countrB�_last_latin_characterr*s r#rEz(SuspiciousDuplicateAccentPlugin.__init__�s��&'���%&���15��"r&c�<�|j�xrt|�SrG)r[rr s  r#r$z(SuspiciousDuplicateAccentPlugin.eligible�s��� � �"�:�x�	�':�:r&c�~�|xjdz
c_|j��t|�r�t|j�ru|j�r/|jj�r|xjdz
c_t|�t|j�k(r|xjdz
c_||_yr])rBrqr
�isupperrprr s  r#r(z$SuspiciousDuplicateAccentPlugin.feed�s�������"���&�&�2��y�)��t�9�9�:�� � �"�t�'A�'A�'I�'I�'K��&�&�!�+�&��Y�'�=��9S�9S�+T�T��&�&�!�+�&�%.��"r&c�.�d|_d|_d|_yrOror*s r#r+z%SuspiciousDuplicateAccentPlugin.reset�s��!"��� !���%)��"r&c�Z�|jdk(ry|jdz|jzS)NrrQrL)rBrpr*s r#r-z%SuspiciousDuplicateAccentPlugin.ratio�s/��� � �A�%���&�&��*�d�.C�.C�C�Cr&Nr4r.r2r5rTr<r&r#rmrm�s,��6�;�/�*�
�D��Dr&rmc�>�eZdZdd�Zdd�Zd	d�Zdd�Zed
d��Zy)�SuspiciousRangec�.�d|_d|_d|_yrO)�"_suspicious_successive_range_countrB�_last_printable_seenr*s r#rEzSuspiciousRange.__init__�s��78��/�%&���04��!r&c�"�|j�SrGrHr s  r#r$zSuspiciousRange.eligible�rJr&c�<�|xjdz
c_|j�st|�s|tvrd|_y|j�||_yt|j�}t|�}t
||�r|xjdz
c_||_yr])rB�isspacerrr{r� is_suspiciously_successive_rangerz)r!r"�unicode_range_a�unicode_range_bs    r#r(zSuspiciousRange.feed�s�������"��
�����i�(��8�8�(,�D�%���$�$�,�(1�D�%��&3�D�4M�4M�&N��&3�I�&>��+�O�_�M��3�3�q�8�3�$-��!r&c�.�d|_d|_d|_yrO)rBrzr{r*s r#r+zSuspiciousRange.reset�s�� !���23��/�$(��!r&c�^�|jdkry|jdz|jz}|S)N�
rQrL)rBrz)r!�ratio_of_suspicious_range_usages  r#r-zSuspiciousRange.ratio�s<��� � �B�&��
�3�3�a�7��!�!�2"�'�/�.r&Nr4r.r2r5rTr<r&r#rxrx�s*��5�
'�.�.)�
�/��/r&rxc�>�eZdZdd�Zdd�Zd	d�Zdd�Zed
d��Zy)�SuperWeirdWordPluginc��d|_d|_d|_d|_d|_d|_d|_d|_d|_d|_	y)NrF�)
�_word_count�_bad_word_count�_foreign_long_count�_is_current_word_bad�_foreign_long_watchrB�_bad_character_count�_buffer�_buffer_accent_count�_buffer_glyph_countr*s r#rEzSuperWeirdWordPlugin.__init__�sQ�� !���$%���()�� �*/��!�).�� �%&���)*��!����)*��!�()�� r&c��yrgr<r s  r#r$zSuperWeirdWordPlugin.eligible
rhr&c���|j�r�|xj|z
c_t|�r|xjdz
c_|jdur`t|�dust|�rHt
|�dur;t|�dur.t|�dur!t|�durt|�durd|_t
|�s,t|�s!t|�st|�st|�r|xjdz
c_y|jsy|j�st|�st|��r�|j�r�|xjdz
c_t!|j�}|xj"|z
c_|dk\r�|j|zdk\rd|_n�t|jd�rX|jdj'�r;t)d�|jD��dur|xj*dz
c_d|_n+|jdk(rd|_|xj*dz
c_|dk\r�|jrwt-|jt/d	|��D��cgc]\}}|j'�r|��}}}d}|rt!|�|zd
krd}|s|xj*dz
c_d|_|j$rD|xj0dz
c_|xj2t!|j�z
c_d|_d|_d|_d	|_d	|_y|dvr<|j5�dur)t7|�rd|_|xj|z
c_yyyycc}}w)
NrFT�g�?���c3�<K�|]}|j����y�wrG)rt)�.0�_s  r#�	<genexpr>z,SuperWeirdWordPlugin.feed.<locals>.<genexpr>7s����>��A�A�I�I�K��s��rrRr�>�-�<�=�>r��|�~)r[r�r
r�r�rrrrrrr�r~rrr��lenrBr�rt�allr��zip�ranger�r�rMr)r!r"�
buffer_length�c�i�camel_case_dst�probable_camel_caseds       r#r(zSuperWeirdWordPlugin.feed
s�������L�L�I�%�L��i�(��)�)�Q�.�)��(�(�E�1��i�(�E�1�^�I�5N��9�%��.��i�(�E�1��	�*�e�3��	�*�e�3��I�&�%�/�+/��(��y�!��Y�'��y�)��y�)��9�%��(�(�A�-�(���|�|������>�)�#<��Y�@W��l�l�����!��!$�T�\�\�!2�M��!�!�]�2�!���!��,�,�}�<��C�04�D�-�#�4�<�<��#3�4����R�(�0�0�2��>����>�>�%�G��,�,��1�,�04�D�-��-�-��2�04�D�-��,�,��1�,���"�t�'?�'?�!$�D�L�L�%��=�2I� J�"� J���1��y�y�{�� J��"�
.3�$�!�s�>�':�]�'J�c�'Q�+/�(�+��,�,��1�,�04�D�-��(�(��$�$��)�$��)�)�S����->�>�)�,1��)�',�D�$��D�L�()�D�%�'(�D�$��@�@��!�!�#�u�,��)�$�(,�D�%��L�L�I�%�L�%�-�
A��1"s�/M1c�t�d|_d|_d|_d|_d|_d|_d|_d|_y)Nr�Fr)r�r�r�r�r�rBr�r�r*s r#r+zSuperWeirdWordPlugin.reset^sA�����$)��!�#(�� � ������ !���$%��!�#$�� r&c�r�|jdkr|jdk(ry|j|jzS)N�
rrQ)r�r�r�rBr*s r#r-zSuperWeirdWordPlugin.ratiohs7�����r�!�d�&>�&>�!�&C���(�(�4�+@�+@�@�@r&Nr4r.r2r5rTr<r&r#r�r��s.��
*��O&�b%��A��Ar&r�c�B�eZdZdZdd�Zd	d�Zd
d�Zdd�Zedd��Z	y)�CjkInvalidStopPluginu�
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    c� �d|_d|_yrO��_wrong_stop_count�_cjk_character_countr*s r#rEzCjkInvalidStopPlugin.__init__vs��&'���)*��!r&c��yrgr<r s  r#r$zCjkInvalidStopPlugin.eligiblezrhr&c�z�|dvr|xjdz
c_yt|�r|xjdz
c_yy)N>�丄�丅r)r�rr�r s  r#r(zCjkInvalidStopPlugin.feed}s<����&��"�"�a�'�"���)���%�%��*�%�r&c� �d|_d|_yrOr�r*s r#r+zCjkInvalidStopPlugin.reset�s��!"���$%��!r&c�T�|jdkry|j|jzS)N�rQ)r�r�r*s r#r-zCjkInvalidStopPlugin.ratio�s*���$�$�r�)���%�%��(A�(A�A�Ar&Nr4r.r2r5)
r7r8r9r:rEr$r(r+r;r-r<r&r#r�r�ps1���
+��+�&��B��Br&r�c�>�eZdZdd�Zdd�Zd	d�Zdd�Zed
d��Zy)�ArchaicUpperLowerPluginc�f�d|_d|_d|_d|_d|_d|_d|_y)NFrT)�_buf�_character_count_since_last_sep�_successive_upper_lower_count�#_successive_upper_lower_count_finalrB�_last_alpha_seen�_current_ascii_onlyr*s r#rEz ArchaicUpperLowerPlugin.__init__�s9����	�45��,�23��*�89��0�%&���,0���)-�� r&c��yrgr<r s  r#r$z ArchaicUpperLowerPlugin.eligible�rhr&c�Z�|j�xrt|�}|du}|r�|jdkDr�|jdkr?|j�dur-|jdur|xj
|jz
c_d|_d|_d|_d|_|xjdz
c_	d|_y|jdur|j�durd|_|j��|j�r|jj�s*|j�rM|jj�r3|jdur|xjdz
c_d|_nd|_nd|_|xjdz
c_	|xjdz
c_||_y)NFr�@rTrL)
r[r
r�rMr�r�r�r�r�rB�isasciirt�islower)r!r"�is_concerned�	chunk_seps    r#r(zArchaicUpperLowerPlugin.feed�s��� �(�(�*�J�/?�	�/J�� �E�)�	���=�=��A��4�4��:��%�%�'�5�0��,�,��5��8�8��6�6��8�23�D�.�34�D�0�$(�D�!��D�I��!�!�Q�&�!�'+�D�$���#�#�t�+�	�0A�0A�0C�u�0L�',�D�$�� � �,��!�!�#��(=�(=�(E�(E�(G��!�!�#��(=�(=�(E�(E�(G��9�9��$��6�6�!�;�6� %�D�I� $�D�I�!��	�����"���,�,��1�,� )��r&c�f�d|_d|_d|_d|_d|_d|_d|_y)NrFT)rBr�r�r�r�r�r�r*s r#r+zArchaicUpperLowerPlugin.reset�s9�� !���/0��,�-.��*�34��0� $�����	�#'�� r&c�T�|jdk(ry|j|jzS)NrrQ)rBr�r*s r#r-zArchaicUpperLowerPlugin.ratio�s*��� � �A�%���7�7�$�:O�:O�O�Or&Nr4r.r2r5rTr<r&r#r�r��s-��.��(*�T(��P��Pr&r�c�>�eZdZdd�Zdd�Zdd�Zd	d�Zed
d��Zy)�ArabicIsolatedFormPluginc� �d|_d|_yrO�rB�_isolated_form_countr*s r#rEz!ArabicIsolatedFormPlugin.__init__�s��%&���)*��!r&c� �d|_d|_yrOr�r*s r#r+zArabicIsolatedFormPlugin.reset�s�� !���$%��!r&c��t|�SrG)rr s  r#r$z!ArabicIsolatedFormPlugin.eligible�s
����#�#r&c�p�|xjdz
c_t|�r|xjdz
c_yyr])rBrr�r s  r#r(zArabicIsolatedFormPlugin.feed�s1������"��"�9�-��%�%��*�%�.r&c�X�|jdkry|j|jz}|S)Nr`rQr�)r!�isolated_form_usages  r#r-zArabicIsolatedFormPlugin.ratio�s0��� � �1�$��%)�%>�%>��AV�AV�%V��"�"r&Nr4r.r2r5)	r7r8r9rEr+r$r(r;r-r<r&r#r�r��s*��+�&�$�+��#��#r&r��)�maxsizec��|�|�y||k(ryd|vrd|vryd|vsd|vryd|vsd|vr	d|vsd|vry|jd�|jd�}}|D]}|tvr�||vs�y|dv|dv}}|s|r	d|vsd|vry|r|ryd	|vsd	|vrd|vsd|vry|d
k(s|d
k(ryd|vsd|vs|dvr!|dvrd|vsd|vryd
|vsd
|vry|d
k(s|d
k(ryy)za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    TF�Latin�	Emoticons�	Combining� )�Hiragana�Katakana�CJK�HangulzBasic Latin)r�r��Punctuation�Forms)�splitr	)r�r��keywords_range_a�keywords_range_b�el�range_a_jp_chars�range_b_jp_charss       r#rr�s�����/�"9���/�)���/�!�g��&@���o�%���)G��	�?�"�g��&@���&�+��*H��	���c�"����c�"�'��
��
�0�0��
�!�!��	�	�
�	
�
	�3�3�
'��	�,�
�� �E�_�$<���,���?�"�h�/�&A��O�#�u��'?���m�+��-�/O��	�� �E�_�$<��3�3��7�7��O�+�}��/O���o�%��O�)C���m�+��-�/O��r&ic	��tj�D�cgc]	}|���}}t|�dz}d}|dkrd}n
|dkrd}nd}t|dzt	|��D]^\}}	|D]%}
|
j|�s�|
j
|��'|	d	kDr|	|zd	k(s	|	|dz
k(s�Ftd
�|D��}||k\s�^n|r�td�}|jtd|�d
|�d|���t|�dkDr8|jtd|dd���|jtd|dd���|D]1}|jt|j�d|j����3t|d�Scc}w)zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    rrQi� r�r���
rc3�4K�|]}|j���y�wrG)r-)r��dts  r#r�zmess_ratio.<locals>.<genexpr>`s����!?�Y�r�"�(�(�Y�s��charset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=r�zStarting with: Nz
Ending with: i�z: �)r�__subclasses__r�r�r�r$r(�sumr�logr�	__class__r-�round)
�decoded_sequence�maximum_threshold�debug�md_class�	detectors�length�mean_mess_ratio�!intermediary_mean_mess_ratio_calcr"�index�detector�loggerr�s
             r#�
mess_ratiorAs���$6�#D�#D�#F�+�#F�x��
�#F��+��&�'�!�+�F� �O�
��|�13�)�	�4��,.�)�,/�)�� 0�4� 7��v��G��	�5�!�H�� � ��+��
�
�i�(�"�

�A�I�%�"C�C�q�H�
�f�q�j�
 �!�!?�Y�!?�?�O��"3�3��H�
��/�0���
�
��
1�1R�0S�Sd�et�du�v!�!2� 3�
5�	
��� �2�%��J�J�u��0@��"�0E�/F�G�H��J�J�u�
�.>�s�u�.E�-F�G�H��B��J�J�u�����b����
�;�<����!�$�$��[+s�E6N)r��
str | Noner�rr0r1)g�������?F)r�r/r�r6r�r1r0r6)(�
__future__r�	functoolsr�loggingr�constantrrr	�utilsr
rrr
rrrrrrrrrrrrrrr>rVrcrmrxr�r�r�r�rrr<r&r#�<module>r	s;��"�����
�����*"�"�D,L�'9�,L�^O�1�O�6E�*�E�0"D�&8�"D�J./�(�./�bsA�-�sA�lB�-�B�>IP�0�IP�X#�1�#�8�4��F��F�2<�F�	�F��F�R�4��IN�4%��4%�.3�4%�BF�4%�
�4%��4%r&
Back to Directory File Manager