Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
N
news
Project
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Sartika Aritonang
news
Commits
b6f3c904
Commit
b6f3c904
authored
4 years ago
by
Sartika Aritonang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Upload New File
parent
bbcf61aa
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
233 additions
and
0 deletions
+233
-0
chardistribution.py
...Lib/site-packages/pip/_vendor/chardet/chardistribution.py
+233
-0
No files found.
stbi/Lib/site-packages/pip/_vendor/chardet/chardistribution.py
0 → 100644
View file @
b6f3c904
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Communicator client code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from
.euctwfreq
import
(
EUCTW_CHAR_TO_FREQ_ORDER
,
EUCTW_TABLE_SIZE
,
EUCTW_TYPICAL_DISTRIBUTION_RATIO
)
from
.euckrfreq
import
(
EUCKR_CHAR_TO_FREQ_ORDER
,
EUCKR_TABLE_SIZE
,
EUCKR_TYPICAL_DISTRIBUTION_RATIO
)
from
.gb2312freq
import
(
GB2312_CHAR_TO_FREQ_ORDER
,
GB2312_TABLE_SIZE
,
GB2312_TYPICAL_DISTRIBUTION_RATIO
)
from
.big5freq
import
(
BIG5_CHAR_TO_FREQ_ORDER
,
BIG5_TABLE_SIZE
,
BIG5_TYPICAL_DISTRIBUTION_RATIO
)
from
.jisfreq
import
(
JIS_CHAR_TO_FREQ_ORDER
,
JIS_TABLE_SIZE
,
JIS_TYPICAL_DISTRIBUTION_RATIO
)
class CharDistributionAnalysis(object):
    """Base analyser that scores text by how many frequent characters it has.

    Every 2-byte character fed in is mapped to an "order" by get_order().
    The analyser counts all characters with a non-negative order and,
    separately, those whose frequency order is below 512, then derives a
    confidence value from the two counters.

    The base class has no frequency table of its own and its get_order()
    always answers -1; subclasses install a table and override get_order().
    """

    ENOUGH_DATA_THRESHOLD = 1024
    SURE_YES = 0.99
    SURE_NO = 0.01
    MINIMUM_DATA_THRESHOLD = 3

    def __init__(self):
        # Table translating a character order (as produced by get_order())
        # into a frequency order, plus the number of entries in it.
        self._char_to_freq_order = None
        self._table_size = None
        # Language-dependent constant used when computing the confidence.
        # See
        # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
        # for further detail.
        self.typical_distribution_ratio = None
        # Running state; given real starting values by reset() below.
        self._done = None
        self._total_chars = None
        self._freq_chars = None
        self.reset()

    def reset(self):
        """Clear all accumulated state so the analyser can be reused."""
        # True once detection is finished and a conclusion has been made.
        self._done = False
        # Number of characters counted so far.
        self._total_chars = 0
        # Number of counted characters whose frequency order is below 512.
        self._freq_chars = 0

    def feed(self, char, char_len):
        """Account for one character whose byte length is already known.

        Only characters with char_len == 2 take part in the analysis, and
        only those for which get_order() yields a non-negative order are
        counted.
        """
        if char_len != 2:
            # Anything that is not a 2-byte character is ignored.
            return
        order = self.get_order(char)
        if order < 0:
            # Outside the range this analyser cares about.
            return
        self._total_chars += 1
        # Orders beyond the table are counted in the total but can never be
        # "frequent"; the bounds check must come before the table lookup.
        if order < self._table_size and self._char_to_freq_order[order] < 512:
            self._freq_chars += 1

    def get_confidence(self):
        """Return a confidence in [SURE_NO, SURE_YES] from the data so far."""
        total = self._total_chars
        frequent = self._freq_chars
        # Nothing in our range, or too few frequent characters: negative
        # answer.
        if total <= 0 or frequent <= self.MINIMUM_DATA_THRESHOLD:
            return self.SURE_NO
        # Every counted character was frequent; the ratio below would divide
        # by zero, so answer with the capped maximum directly.
        if total == frequent:
            return self.SURE_YES
        ratio = (frequent /
                 ((total - frequent) * self.typical_distribution_ratio))
        # Cap the result: we never want to claim to be 100% sure.
        return ratio if ratio < self.SURE_YES else self.SURE_YES

    def got_enough_data(self):
        """Tell whether more than ENOUGH_DATA_THRESHOLD characters were counted.

        A conclusion can be drawn from a limited amount of data; seeing the
        whole input is not required.
        """
        return self._total_chars > self.ENOUGH_DATA_THRESHOLD

    def get_order(self, byte_str):
        """Map the bytes of one character to its order; -1 means "ignore".

        Characters are not handled via their encoded bytes directly but via
        this number, which lets several encodings of one language share a
        single frequency table. The base implementation knows no encoding
        and rejects everything.
        """
        return -1
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
    """Character-distribution analyser wired to the EUC-TW frequency table."""

    def __init__(self):
        super(EUCTWDistributionAnalysis, self).__init__()
        # Install the EUC-TW specific table and constants.
        self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
        self._table_size = EUCTW_TABLE_SIZE
        self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER

    def get_order(self, byte_str):
        """Return the table index for a 2-byte EUC-TW character, or -1.

        Bytes of interest:
          first byte:  0xc4 -- 0xfe
          second byte: 0xa1 -- 0xfe
        The input is not validated here; the state machine has done that.
        """
        lead = byte_str[0]
        if lead < 0xC4:
            return -1
        # 94 cells per row, rows counted from lead byte 0xC4.
        return 94 * (lead - 0xC4) + byte_str[1] - 0xA1
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
    """Character-distribution analyser wired to the EUC-KR frequency table."""

    def __init__(self):
        super(EUCKRDistributionAnalysis, self).__init__()
        # Install the EUC-KR specific table and constants.
        self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
        self._table_size = EUCKR_TABLE_SIZE
        self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER

    def get_order(self, byte_str):
        """Return the table index for a 2-byte EUC-KR character, or -1.

        Bytes of interest:
          first byte:  0xb0 -- 0xfe
          second byte: 0xa1 -- 0xfe
        The input is not validated here; the state machine has done that.
        """
        lead = byte_str[0]
        if lead < 0xB0:
            return -1
        # 94 cells per row, rows counted from lead byte 0xB0.
        return 94 * (lead - 0xB0) + byte_str[1] - 0xA1
class GB2312DistributionAnalysis(CharDistributionAnalysis):
    """Character-distribution analyser wired to the GB2312 frequency table."""

    def __init__(self):
        super(GB2312DistributionAnalysis, self).__init__()
        # Install the GB2312 specific table and constants.
        self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
        self._table_size = GB2312_TABLE_SIZE
        self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER

    def get_order(self, byte_str):
        """Return the table index for a 2-byte GB2312 character, or -1.

        Bytes of interest:
          first byte:  0xb0 -- 0xfe
          second byte: 0xa1 -- 0xfe
        The input is not validated here; the state machine has done that.
        """
        lead, trail = byte_str[0], byte_str[1]
        # Unlike the EUC analysers, the lower bound of the second byte is
        # checked here as well.
        if lead < 0xB0 or trail < 0xA1:
            return -1
        # 94 cells per row, rows counted from lead byte 0xB0.
        return 94 * (lead - 0xB0) + trail - 0xA1
class Big5DistributionAnalysis(CharDistributionAnalysis):
    """Character-distribution analyser wired to the Big5 frequency table."""

    def __init__(self):
        super(Big5DistributionAnalysis, self).__init__()
        # Install the Big5 specific table and constants.
        self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
        self._table_size = BIG5_TABLE_SIZE
        self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER

    def get_order(self, byte_str):
        """Return the table index for a 2-byte Big5 character, or -1.

        Bytes of interest:
          first byte:  0xa4 -- 0xfe
          second byte: 0x40 -- 0x7e , 0xa1 -- 0xfe
        The input is not validated here; the state machine has done that.
        """
        lead, trail = byte_str[0], byte_str[1]
        if lead < 0xA4:
            return -1
        # 157 cells per row, rows counted from lead byte 0xA4.
        row_start = 157 * (lead - 0xA4)
        if trail >= 0xA1:
            # Upper trail range: placed after the 63 cells of 0x40 -- 0x7e.
            return row_start + trail - 0xA1 + 63
        # Lower trail range starts the row.
        return row_start + trail - 0x40
class SJISDistributionAnalysis(CharDistributionAnalysis):
    """Character-distribution analyser for Shift_JIS using the JIS table."""

    def __init__(self):
        super(SJISDistributionAnalysis, self).__init__()
        # Install the JIS table and constants.
        self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
        self._table_size = JIS_TABLE_SIZE
        self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER

    def get_order(self, byte_str):
        """Return the table index for a 2-byte Shift_JIS character, or -1.

        Bytes of interest:
          first byte:  0x81 -- 0x9f , 0xe0 -- 0xfe
          second byte: 0x40 -- 0x7e , 0x81 -- 0xfe
        The input is not validated here; the state machine has done that.
        """
        lead, trail = byte_str[0], byte_str[1]
        if 0x81 <= lead <= 0x9F:
            row = lead - 0x81
        elif 0xE0 <= lead <= 0xEF:
            # Second lead-byte block continues after the 31 rows of the
            # first one (0x81 -- 0x9f). Lead bytes above 0xEF are rejected.
            row = lead - 0xE0 + 31
        else:
            return -1
        # NOTE(review): every character whose second byte is above 0x7F is
        # rejected here, although the range comment above lists 0x81 -- 0xfe
        # as of interest. The Mozilla C++ original appears to decrement the
        # order by one instead of discarding it -- confirm before changing,
        # since it affects detection confidence.
        if trail > 0x7F:
            return -1
        # 188 cells per row, second byte counted from 0x40.
        return 188 * row + trail - 0x40
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
    """Character-distribution analyser for EUC-JP using the JIS table."""

    def __init__(self):
        super(EUCJPDistributionAnalysis, self).__init__()
        # Install the JIS table and constants.
        self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
        self._table_size = JIS_TABLE_SIZE
        self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER

    def get_order(self, byte_str):
        """Return the table index for a 2-byte EUC-JP character, or -1.

        Bytes of interest:
          first byte:  0xa0 -- 0xfe
          second byte: 0xa1 -- 0xfe
        The input is not validated here; the state machine has done that.
        """
        lead = byte_str[0]
        if lead < 0xA0:
            return -1
        # 94 cells per row. The guard admits 0xA0 while rows are counted from
        # 0xA1, so a lead byte of exactly 0xA0 yields a negative order for
        # any second byte up to 0xFE.
        return 94 * (lead - 0xA1) + byte_str[1] - 0xA1
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment