33#define MAX_BUFFER 16*1024
62 switch (codec->mibEnum())
74class KEncodingDetectorPrivate
78 QTextDecoder *m_decoder;
79 QTextCodec *m_defaultCodec;
80 QByteArray m_storeDecoderName;
87 bool m_writtingHappened : 1;
88 bool m_analyzeCalled : 1;
91 QByteArray m_bufferForDefferedEncDetection;
93 KEncodingDetectorPrivate()
94 : m_codec(QTextCodec::codecForMib(
MibLatin1))
95 , m_decoder(m_codec->makeDecoder())
96 , m_defaultCodec(m_codec)
97 , m_source(KEncodingDetector::DefaultEncoding)
98 , m_autoDetectLanguage(KEncodingDetector::SemiautomaticDetection)
101 , m_writtingHappened(false)
102 , m_analyzeCalled(false)
109 , m_decoder(m_codec->makeDecoder())
110 , m_defaultCodec(m_codec)
112 , m_autoDetectLanguage(script)
115 , m_writtingHappened(false)
116 , m_analyzeCalled(false)
121 ~KEncodingDetectorPrivate()
127 bool isExplicitlySpecifiedEncoding()
136 for (
int i = 0; i < size; ++i ) {
137 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3
138 || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA )
139 || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0
140 || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) {
150 for (
int i = 0; i < size; ++i ) {
151 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) )
154 if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 )
155 return "iso-8859-13";
158 return "iso-8859-13";
163 QByteArray charset = QByteArray();
164 for (
int i = 0; i < size; ++i ) {
165 if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) {
166 if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 )
176 if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) {
180 if ( charset.isNull() )
181 charset =
"iso-8859-2";
187 if ( charset.isNull() )
188 charset =
"iso-8859-3";
190 return charset.data();
196 kWarning() <<
"KEncodingDetector: Cyr heuristics";
213 int cp1251_o_capital=0;
218 int cp1251_a_capital=0;
223 int cp1251_s_capital=0;
228 int cp1251_i_capital=0;
231 int cp1251_small_range=0;
232 int koi_small_range=0;
233 int ibm866_small_range=0;
236 for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i)
240 ++cp1251_small_range;
244 else if (ptr[i]==0xe0)
246 else if (ptr[i]==0xe8)
248 else if (ptr[i]==0xf1)
250 else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)
253 else if (ptr[i]==0xef)
255 else if (ptr[i]==0xe1)
257 else if (ptr[i]==0xe9)
259 else if (ptr[i]==0xf3)
263 else if (ptr[i]>0xbf)
267 if (ptr[i]==0xd0||ptr[i]==0xd1)
269 else if (ptr[i]==0xcf)
271 else if (ptr[i]==0xc1)
273 else if (ptr[i]==0xc9)
275 else if (ptr[i]==0xd3)
277 else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)
280 else if (ptr[i]==0xce)
282 else if (ptr[i]==0xc0)
284 else if (ptr[i]==0xc8)
286 else if (ptr[i]==0xd1)
289 else if (ptr[i]>0x9f && ptr[i]<0xb0)
290 ++ibm866_small_range;
295 if (cp1251_small_range+koi_small_range+ibm866_small_range<8)
300 if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range)
303 kWarning() <<
"Cyr Enc Detection: UTF8";
308 if (ibm866_small_range>cp1251_small_range+koi_small_range)
314 if (cp1251_st==0 && koi_st>1)
316 else if (koi_st==0 && cp1251_st>1)
319 if (cp1251_st && koi_st)
321 if (cp1251_st/koi_st>2)
323 else if (koi_st/cp1251_st>2)
329 else if (cp1251_a || koi_a)
334 else if (cp1251_o || koi_o)
339 else if (cp1251_i || koi_i)
344 else if (cp1251_s || koi_s)
347 if (cp1251_a_capital>koi_a_capital)
349 else if (cp1251_a_capital || koi_a_capital)
352 if (cp1251_o_capital>koi_o_capital)
354 else if (cp1251_o_capital || koi_o_capital)
357 if (cp1251_i_capital>koi_i_capital)
359 else if (cp1251_i_capital || koi_i_capital)
362 if (cp1251_s_capital>koi_s_capital)
364 else if (cp1251_s_capital || koi_s_capital)
367 kWarning()<<
"koi_score " << koi_score <<
" cp1251_score " << cp1251_score;
369 if (abs(koi_score-cp1251_score)<10)
372 cp1251_score=cp1251_small_range;
373 koi_score=koi_small_range;
375 if (cp1251_score>koi_score)
391 for (
int i = 0; i < size; ++i ) {
392 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B
393 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4
394 || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) {
404 for (
int i = 0; i < size; ++i ) {
405 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B
406 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 )
407 || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) {
411 if ( ptr[ i ] == 0xDF )
412 return "iso-8859-8-i";
415 return "iso-8859-8-i";
422 switch ( kc.
guess_jp( (
const char*)ptr, size ) ) {
440 for (
int i = 0; i < size; ++i ) {
441 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) {
452 uint nonansi_count=0;
453 for (
int i=0; i<size; ++i)
458 if ( ptr[i]>0xc1 && ptr[i]<0xf0 && ptr[i+1]>0x7f && ptr[i+1]<0xc0)
462 if (ptr[i] >= 0x78 && ptr[i]<=0x9F )
471 return "iso-8859-15";
493 if (p[1]==
'-' && p[2]==
'>')
499 if (p[1] ==
'-' && p[2] ==
'!' && p[3] ==
'>')
514 int len = str.length();
515 int pos = str.indexOf(
"encoding");
521 while (pos<len && str[pos]<=
' ')
526 if (pos>=len || str[pos] !=
'=')
531 while (pos<len && str[pos]<=
' ')
539 char quoteMark = str[pos];
540 if (quoteMark !=
'"' && quoteMark !=
'\'')
546 while (end<len && str[end]!=quoteMark)
552 encodingLength = end-pos;
561 for (
int i=1; i < len; i+=2)
563 if ((data[i]==
'\0') && (data[i-1]==
'\0'))
587 if (d->m_codec->mibEnum()!=
MibUtf8)
594static const unsigned char highest1Bits = 0x80;
595static const unsigned char highest2Bits = 0xC0;
596static const unsigned char highest3Bits = 0xE0;
597static const unsigned char highest4Bits = 0xF0;
598static const unsigned char highest5Bits = 0xF8;
600 for (
int i=0; i<length; ++i)
602 unsigned char c = data[i];
604 if (d->m_multiByte>0)
606 if ((c & highest2Bits) == 0x80)
612 kWarning() <<
"EncDetector: Broken UTF8";
618 if ((c & highest1Bits) == 0x00)
622 if ((c & highest3Bits) == 0xC0)
629 if ((c & highest4Bits) == 0xE0)
636 if ((c & highest5Bits) == 0xF0)
642 kWarning() <<
"EncDetector:_Broken UTF8";
655 d(new KEncodingDetectorPrivate(codec,source,script))
666 d->m_autoDetectLanguage=lang;
670 return d->m_autoDetectLanguage;
680 d->m_storeDecoderName = d->m_codec->name();
681 return d->m_storeDecoderName.constData();
686 return d->m_visualRTL;
701 assert(d->m_defaultCodec);
702 d->m_bufferForDefferedEncDetection.clear();
703 d->m_writtingHappened =
false;
704 d->m_analyzeCalled =
false;
708 d->m_codec = d->m_defaultCodec;
709 d->m_decoder = d->m_codec->makeDecoder();
715 QByteArray enc(_encoding);
719 codec=d->m_defaultCodec;
737 if (d->m_codec->mibEnum()==codec->mibEnum())
755 codec = QTextCodec::codecForName(
"iso8859-8-i");
758 if(!(enc==
"iso-8859-8-i"||enc==
"iso_8859-8-i"||enc==
"csiso88598i"||enc==
"logical"))
759 d->m_visualRTL =
true;
765 d->m_decoder = d->m_codec->makeDecoder();
767 kDebug(6005) <<
"KEncodingDetector::encoding used is" << d->m_codec->name();
775 if (!d->m_analyzeCalled)
778 d->m_analyzeCalled=
true;
781 return d->m_decoder->toUnicode(data,len);
786 processNull(
const_cast<char *
>(data.data()),data.size());
787 if (!d->m_analyzeCalled)
789 analyze(data.data(),data.size());
790 d->m_analyzeCalled=
true;
793 return d->m_decoder->toUnicode(data);
799 kWarning() <<
"KEncodingDetector: decoding "<<len<<
" bytes";
801 if (d->m_writtingHappened)
804 kWarning() <<
"KEncodingDetector: d->m_writtingHappened "<< d->m_codec->name();
807 return d->m_decoder->toUnicode(data, len);
811 if (d->m_bufferForDefferedEncDetection.isEmpty())
815 if (
analyze(data,len) && (d->m_seenBody || d->isExplicitlySpecifiedEncoding()))
818 kWarning() <<
"KEncodingDetector: m_writtingHappened first time "<< d->m_codec->name();
821 d->m_writtingHappened=
true;
822 return d->m_decoder->toUnicode(data, len);
827 kWarning() <<
"KEncodingDetector: begin deffer";
829 d->m_bufferForDefferedEncDetection=data;
834 d->m_bufferForDefferedEncDetection+=data;
837 bool detected =
analyze(d->m_bufferForDefferedEncDetection.constData(), d->m_bufferForDefferedEncDetection.length());
838 if ((detected && (d->m_seenBody || d->isExplicitlySpecifiedEncoding())) ||
839 d->m_bufferForDefferedEncDetection.length() >
MAX_BUFFER)
841 d->m_writtingHappened=
true;
842 d->m_bufferForDefferedEncDetection.replace(
'\0',
' ');
843 QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection));
844 d->m_bufferForDefferedEncDetection.clear();
846 kWarning() <<
"KEncodingDetector: m_writtingHappened in the middle " << d->m_codec->name();
858 return d->m_decoder ? d->m_decoder->hasFailure() :
false;
863 if (d->m_bufferForDefferedEncDetection.isEmpty())
866 d->m_bufferForDefferedEncDetection.replace(
'\0',
' ');
867 QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection));
868 d->m_bufferForDefferedEncDetection.clear();
870 kWarning() <<
"KEncodingDetector:flush() "<< d->m_bufferForDefferedEncDetection.length()<<
" bytes "<< d->m_codec->name();
883 const uchar *udata = (
const uchar *)data;
889 const char *autoDetectedEncoding;
890 if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE))
892 autoDetectedEncoding =
"UTF-16";
894 else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
896 autoDetectedEncoding =
"UTF-8";
898 else if (c1 == 0x00 || c2 == 0x00)
906 uchar c10 = *udata++;
908 int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
909 int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
910 if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0))
911 autoDetectedEncoding =
"UTF-16";
913 autoDetectedEncoding = 0;
917 autoDetectedEncoding = 0;
921 if (autoDetectedEncoding != 0)
924 d->m_codec = QTextCodec::codecForName(autoDetectedEncoding);
928 d->m_decoder = d->m_codec->makeDecoder();
932 if (
is16Bit(d->m_codec) && c2==0x00)
935 char reverseUtf16[3] = {(char)0xFF, (
char)0xFE, 0x00};
936 d->m_decoder->toUnicode(reverseUtf16, 2);
946 kWarning() <<
"KEncodingDetector: UserChosenEncoding exit ";
963 const char *ptr = data;
964 const char *pEnd = data+len;
975 if (ptr[0] ==
'!' && ptr[1] ==
'-' && ptr[2] ==
'-')
983 if (ptr[0]==
'?' && ptr[1]==
'x' && ptr[2]==
'm' && ptr[3]==
'l')
985 const char *end = ptr;
986 while (*end !=
'>' && end < pEnd)
988 if (*end ==
'\0' || end == pEnd)
990 QByteArray str(ptr, end - ptr);
1002 !(((*ptr >=
'a') && (*ptr <=
'z')) ||
1003 ((*ptr >=
'A') && (*ptr <=
'Z')))
1010 const char* max=ptr+4;
1014 (((*ptr >=
'a') && (*ptr <=
'z')) ||
1015 ((*ptr >=
'A') && (*ptr <=
'Z')) ||
1016 ((*ptr >=
'0') && (*ptr <=
'9')))
1020 tmp[length] = tolower( *ptr );
1025 if (tmp[0]==
'm'&&tmp[1]==
'e'&&tmp[2]==
't'&&tmp[3]==
'a')
1028 const char* end = ptr;
1029 while(*end !=
'>' && *end !=
'\0' && end<pEnd)
1032 QByteArray str( ptr, (end-ptr)+1);
1033 str = str.toLower();
1034 const int strLength = str.length();
1038 if( (pos = str.indexOf(
"charset")) == -1)
1042 if( (pos = str.indexOf(
"=", pos)) == -1)
1049 while (pos < strLength && str[pos] <=
' ')
1053 if (pos < strLength && (str[pos] ==
'"' || str[pos] ==
'\''))
1057 while (pos < strLength && str[pos] <=
' ')
1060 if ( pos == strLength)
1064 while( endpos < strLength &&
1065 (str[endpos] !=
' ' && str[endpos] !=
'"' && str[endpos] !=
'\''
1066 && str[endpos] !=
';' && str[endpos] !=
'>') )
1069 kDebug( 6005 ) <<
"KEncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data();
1074 else if (tmp[0]==
'b'&&tmp[1]==
'o'&&tmp[2]==
'd'&&tmp[3]==
'y')
1086 kDebug( 6005 ) <<
"KEncodingDetector: using heuristics (" << strlen(data) <<
")";
1089 switch ( d->m_autoDetectLanguage)
1118 else if (d->m_defaultCodec->mibEnum()==
MibLatin1)
1149 else if (lang==
i18nc(
"@item Text character set",
"Unicode"))
1151 else if (lang==
i18nc(
"@item Text character set",
"Cyrillic"))
1153 else if (lang==
i18nc(
"@item Text character set",
"Western European"))
1155 else if (lang==
i18nc(
"@item Text character set",
"Central European"))
1157 else if (lang==
i18nc(
"@item Text character set",
"Greek"))
1159 else if (lang==
i18nc(
"@item Text character set",
"Hebrew"))
1161 else if (lang==
i18nc(
"@item Text character set",
"Turkish"))
1163 else if (lang==
i18nc(
"@item Text character set",
"Japanese"))
1165 else if (lang==
i18nc(
"@item Text character set",
"Baltic"))
1167 else if (lang==
i18nc(
"@item Text character set",
"Arabic"))
1212 return i18nc(
"@item Text character set",
"Arabic");
1215 return i18nc(
"@item Text character set",
"Baltic");
1218 return i18nc(
"@item Text character set",
"Central European");
1221 return i18nc(
"@item Text character set",
"Cyrillic");
1224 return i18nc(
"@item Text character set",
"Greek");
1227 return i18nc(
"@item Text character set",
"Hebrew");
1230 return i18nc(
"@item Text character set",
"Japanese");
1233 return i18nc(
"@item Text character set",
"Turkish");
1236 return i18nc(
"@item Text character set",
"Western European");
1239 return i18nc(
"@item Text character set",
"Chinese Traditional");
1242 return i18nc(
"@item Text character set",
"Chinese Simplified");
1245 return i18nc(
"@item Text character set",
"Korean");
1248 return i18nc(
"@item Text character set",
"Thai");
1251 return i18nc(
"@item Text character set",
"Unicode");
QTextCodec * codecForName(const QString &name) const
Provided for compatibility.
static AutoDetectScript scriptForName(const QString &lang)
Takes the language name after it has been i18n()'ed.
QString decodeWithBuffering(const char *data, int len)
Convenience method that uses buffering.
const char * encoding() const
Convenience method.
bool errorsIfUtf8(const char *data, int length)
Check if we are really utf8.
bool visuallyOrdered() const
QString flush()
Convenience method to be used with decodeWithBuffering().
static QString nameForScript(AutoDetectScript)
KEncodingDetector()
Default codec is Latin-1 (as the HTML spec says), EncodingChoiceSource is DefaultEncoding, AutoDetectScript is SemiautomaticDetection.
QString decode(const char *data, int len)
The main class method.
static bool hasAutoDetectionForScript(AutoDetectScript)
void setAutoDetectLanguage(AutoDetectScript)
bool decodedInvalidCharacters() const
This method checks whether invalid characters were found during a decoding operation.
bool analyze(const char *data, int len)
Analyze text data.
void resetDecoder()
Resets the decoder.
EncodingChoiceSource encodingChoiceSource() const
bool processNull(char *data, int length)
This nice method will kill all 0 bytes (or double bytes) and remember if this was a binary or not ;)
bool setEncoding(const char *encoding, EncodingChoiceSource type)
AutoDetectScript autoDetectLanguage() const
enum Type guess_jp(const char *buf, int buflen)
static QByteArray automaticDetectionForHebrew(const unsigned char *ptr, int size)
static void skipComment(const char *&ptr, const char *pEnd)
static bool is16Bit(QTextCodec *codec)
static QByteArray automaticDetectionForBaltic(const unsigned char *ptr, int size)
static QByteArray automaticDetectionForCyrillic(const unsigned char *ptr, int size)
static QByteArray automaticDetectionForGreek(const unsigned char *ptr, int size)
static QByteArray automaticDetectionForCentralEuropean(const unsigned char *ptr, int size)
static QByteArray automaticDetectionForJapanese(const unsigned char *ptr, int size)
static QByteArray automaticDetectionForTurkish(const unsigned char *ptr, int size)
static QByteArray automaticDetectionForArabic(const unsigned char *ptr, int size)
static int findXMLEncoding(const QByteArray &str, int &encodingLength)
static QByteArray automaticDetectionForWesternEuropean(const unsigned char *ptr, int size)
QString i18nc(const char *ctxt, const char *text)
Returns a localized version of a string and a context.
KCharsets * charsets()
The global charset manager.