mirror of
https://github.com/Xevion/easy7zip.git
synced 2025-12-12 20:11:34 -06:00
Update to 7-Zip Version 21.02
This commit is contained in:
@@ -3,10 +3,382 @@
|
||||
#ifndef __COMMON_UTF_CONVERT_H
|
||||
#define __COMMON_UTF_CONVERT_H
|
||||
|
||||
#include "MyBuffer.h"
|
||||
#include "MyString.h"
|
||||
|
||||
// Declaration only; the definition is not part of this header.
// (allowReduced) defaults to false and throw() marks the function as non-throwing.
// NOTE(review): (allowReduced) presumably has the meaning documented for
// Check_UTF8_Buf() further down (permit a truncated last sequence) - confirm.
bool CheckUTF8(const char *src, bool allowReduced = false) throw();
|
||||
// NOTE(review): ConvertUTF8ToUnicode() is declared a second time further down
// with the parameters named (src, dest). The redeclaration is compatible, but
// one of the two looks redundant - confirm which one should stay.
bool ConvertUTF8ToUnicode(const AString &utfString, UString &resultString);
|
||||
// NOTE(review): ConvertUnicodeToUTF8() is likewise declared again further down
// with the parameters named (src, dest) - confirm which declaration should stay.
void ConvertUnicodeToUTF8(const UString &unicodeString, AString &resultString);
|
||||
struct CUtf8Check
|
||||
{
|
||||
// Byte MaxByte; // in original src stream
|
||||
bool NonUtf;
|
||||
bool ZeroChar;
|
||||
bool SingleSurrogate;
|
||||
bool Escape;
|
||||
bool Truncated;
|
||||
UInt32 MaxHighPoint; // only for points >= 0x80
|
||||
|
||||
CUtf8Check() { Clear(); }
|
||||
|
||||
void Clear()
|
||||
{
|
||||
// MaxByte = 0;
|
||||
NonUtf = false;
|
||||
ZeroChar = false;
|
||||
SingleSurrogate = false;
|
||||
Escape = false;
|
||||
Truncated = false;
|
||||
MaxHighPoint = 0;
|
||||
}
|
||||
|
||||
void Update(const CUtf8Check &c)
|
||||
{
|
||||
if (c.NonUtf) NonUtf = true;
|
||||
if (c.ZeroChar) ZeroChar = true;
|
||||
if (c.SingleSurrogate) SingleSurrogate = true;
|
||||
if (c.Escape) Escape = true;
|
||||
if (c.Truncated) Truncated = true;
|
||||
if (MaxHighPoint < c.MaxHighPoint) MaxHighPoint = c.MaxHighPoint;
|
||||
}
|
||||
|
||||
void PrintStatus(AString &s) const
|
||||
{
|
||||
s.Empty();
|
||||
|
||||
// s.Add_OptSpaced("MaxByte=");
|
||||
// s.Add_UInt32(MaxByte);
|
||||
|
||||
if (NonUtf) s.Add_OptSpaced("non-UTF8");
|
||||
if (ZeroChar) s.Add_OptSpaced("ZeroChar");
|
||||
if (SingleSurrogate) s.Add_OptSpaced("SingleSurrogate");
|
||||
if (Escape) s.Add_OptSpaced("Escape");
|
||||
if (Truncated) s.Add_OptSpaced("Truncated");
|
||||
|
||||
if (MaxHighPoint != 0)
|
||||
{
|
||||
s.Add_OptSpaced("MaxUnicode=");
|
||||
s.Add_UInt32(MaxHighPoint);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
bool IsOK(bool allowReduced = false) const
|
||||
{
|
||||
if (NonUtf || SingleSurrogate || ZeroChar)
|
||||
return false;
|
||||
if (MaxHighPoint >= 0x110000)
|
||||
return false;
|
||||
if (Truncated && !allowReduced)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// it checks full buffer as specified in (size) and it doesn't stop on zero char
|
||||
void Check_Buf(const char *src, size_t size) throw();
|
||||
|
||||
void Check_AString(const AString &s) throw()
|
||||
{
|
||||
Check_Buf(s.Ptr(), s.Len());
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
if (allowReduced == false) - all UTF-8 character sequences must be finished.
|
||||
if (allowReduced == true) - it allows truncated last character-Utf8-sequence
|
||||
*/
|
||||
|
||||
// Declaration only; throw() marks the function as non-throwing.
// (src, size) describe the buffer to check; (allowReduced) has no default here.
bool Check_UTF8_Buf(const char *src, size_t size, bool allowReduced) throw();
|
||||
// Declaration only; throw() marks the function as non-throwing.
// NOTE(review): presumably the AString counterpart of Check_UTF8_Buf(); which
// (allowReduced) mode it uses is not visible in this header - confirm.
bool CheckUTF8_AString(const AString &s) throw();
|
||||
|
||||
// Bit flags for the UTF-8 -> Unicode direction (bits 0..2).
// The full description of every flag is in the comment block that follows.
// NOTE(review): presumably combined into the (flags) argument of the
// UTF-8 -> Unicode conversion functions declared further down - confirm.

// bit 0: if set, SINGLE-SURROGATE-8 is not accepted as a valid point;
// it is turned into ESCAPE or U+fffd instead.
#define UTF_FLAG__FROM_UTF8__SURROGATE_ERROR (1 << 0)
|
||||
// bit 1: if set, ESCAPE codes (instead of U+fffd) are generated for invalid input.
#define UTF_FLAG__FROM_UTF8__USE_ESCAPE (1 << 1)
|
||||
// bit 2: if set, ESCAPE-16-21 is generated for ESCAPE-8 points.
#define UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT (1 << 2)
|
||||
|
||||
/*
|
||||
UTF_FLAG__FROM_UTF8__SURROGATE_ERROR
|
||||
|
||||
if (flag is NOT set)
|
||||
{
|
||||
it processes SINGLE-SURROGATE-8 as valid Unicode point.
|
||||
it converts SINGLE-SURROGATE-8 to SINGLE-SURROGATE-16
|
||||
Note: some sequences of two SINGLE-SURROGATE-8 points
|
||||
will generate correct SURROGATE-16-PAIR, and
|
||||
that SURROGATE-16-PAIR later will be converted to correct
|
||||
UTF8-SURROGATE-21 point. So we don't restore original
|
||||
STR-8 sequence in that case.
|
||||
}
|
||||
|
||||
if (flag is set)
|
||||
{
|
||||
if (UTF_FLAG__FROM_UTF8__USE_ESCAPE is set)
|
||||
it generates ESCAPE for SINGLE-SURROGATE-8,
|
||||
if (UTF_FLAG__FROM_UTF8__USE_ESCAPE is NOT set)
|
||||
it generates U+fffd for SINGLE-SURROGATE-8,
|
||||
}
|
||||
|
||||
|
||||
UTF_FLAG__FROM_UTF8__USE_ESCAPE
|
||||
|
||||
if (flag is NOT set)
|
||||
it generates (U+fffd) code for non-UTF-8 (invalid) characters
|
||||
|
||||
if (flag is set)
|
||||
{
|
||||
It generates (ESCAPE) codes for NON-UTF-8 (invalid) characters.
|
||||
And later we can restore original UTF-8-RAW characters from (ESCAPE-16-21) codes.
|
||||
}
|
||||
|
||||
UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT
|
||||
|
||||
if (flag is NOT set)
|
||||
{
|
||||
it processes ESCAPE-8 points like any other Unicode points.
|
||||
In Linux: ESCAPE-16 will mean two different ESCAPE-8 sequences,
|
||||
so we need HIGH-ESCAPE-PLANE-21 to restore UTF-8-RAW -> UTF-16 -> UTF-8-RAW
|
||||
}
|
||||
|
||||
if (flag is set)
|
||||
{
|
||||
it generates ESCAPE-16-21 for ESCAPE-8 points
|
||||
so we can restore UTF-8-RAW -> UTF-16 -> UTF-8-RAW without HIGH-ESCAPE-PLANE-21.
|
||||
}
|
||||
|
||||
|
||||
Main USE CASES with UTF-8 <-> UTF-16 conversions:
|
||||
|
||||
WIN32: UTF-16-RAW -> UTF-8 (Archive) -> UTF-16-RAW
|
||||
{
|
||||
set UTF_FLAG__FROM_UTF8__USE_ESCAPE
|
||||
Do NOT set UTF_FLAG__FROM_UTF8__SURROGATE_ERROR
|
||||
Do NOT set UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT
|
||||
|
||||
So we restore original SINGLE-SURROGATE-16 from single SINGLE-SURROGATE-8.
|
||||
}
|
||||
|
||||
Linux: UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW
|
||||
{
|
||||
we want to restore original UTF-8-RAW sequence later from that ESCAPE-16.
|
||||
Set the flags:
|
||||
UTF_FLAG__FROM_UTF8__SURROGATE_ERROR
|
||||
UTF_FLAG__FROM_UTF8__USE_ESCAPE
|
||||
UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT
|
||||
}
|
||||
|
||||
MacOS: UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW
|
||||
{
|
||||
we want to restore correct UTF-8 without any BMP processing:
|
||||
Set the flags:
|
||||
UTF_FLAG__FROM_UTF8__SURROGATE_ERROR
|
||||
UTF_FLAG__FROM_UTF8__USE_ESCAPE
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
// zero char is not allowed in (src) buf
|
||||
// Declaration only. Takes the source as (src, srcSize) and writes into (dest).
// (flags) defaults to 0, i.e. no flag bits set.
// NOTE(review): the bool result presumably reports whether the input converted
// without errors - confirm in the implementation.
bool Convert_UTF8_Buf_To_Unicode(const char *src, size_t srcSize, UString &dest, unsigned flags = 0);
|
||||
|
||||
// Declaration only. AString variant with an explicit (flags) argument (default 0).
bool ConvertUTF8ToUnicode_Flags(const AString &src, UString &dest, unsigned flags = 0);
|
||||
// Declaration only. Variant without a (flags) argument.
// NOTE(review): which flags this variant applies is not visible in this header - confirm.
bool ConvertUTF8ToUnicode(const AString &src, UString &dest);
|
||||
|
||||
// Bit flags for the Unicode -> UTF-8 direction (bits 8 and 9; bit 10 is disabled).
// The full description of every flag is in the comment block that follows.

// bit 8: if set, UTF_REPLACEMENT_CHAR (0xfffd) is generated for SINGLE_SURROGATE.
#define UTF_FLAG__TO_UTF8__SURROGATE_ERROR (1 << 8)
|
||||
// bit 9: if set, the raw 8-bit symbol is extracted from Escape-Plane-16.
#define UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE (1 << 9)
|
||||
// bit 10 is reserved for the high-escape mode; the flag is currently commented out.
// #define UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE (1 << 10)
|
||||
|
||||
/*
|
||||
UTF_FLAG__TO_UTF8__SURROGATE_ERROR
|
||||
|
||||
if (flag is NOT set)
|
||||
{
|
||||
we extract SINGLE-SURROGATE as normal UTF-8
|
||||
|
||||
In Windows : for UTF-16-RAW <-> UTF-8 (archive) <-> UTF-16-RAW.
|
||||
|
||||
In Linux :
|
||||
use-case-1: UTF-8 -> UTF-16 -> UTF-8 doesn't generate UTF-16 SINGLE-SURROGATE,
|
||||
if (UTF_FLAG__FROM_UTF8__SURROGATE_ERROR) is used.
|
||||
use-case 2: UTF-16-7z (with SINGLE-SURROGATE from Windows) -> UTF-8 (Linux)
|
||||
will generate SINGLE-SURROGATE-UTF-8 here.
|
||||
}
|
||||
|
||||
if (flag is set)
|
||||
{
|
||||
we generate UTF_REPLACEMENT_CHAR (0xfffd) for SINGLE_SURROGATE
|
||||
it can be used for compatibility mode with WIN32 UTF function
|
||||
or if we want UTF-8 stream without any errors
|
||||
}
|
||||
|
||||
|
||||
UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE
|
||||
|
||||
if (flag is NOT set) it doesn't extract raw 8-bit symbol from Escape-Plane-16
|
||||
if (flag is set) it extracts raw 8-bit symbol from Escape-Plane-16
|
||||
|
||||
in Linux we need some way to extract NON-UTF8 RAW 8-bits from BMP (UTF-16 7z archive):
|
||||
if (we use High-Escape-Plane), we can transfer BMP escapes to High-Escape-Plane.
|
||||
if (we don't use High-Escape-Plane), we must use UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE.
|
||||
|
||||
|
||||
UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE
|
||||
// that flag affects the code only if (wchar_t is 32-bit)
|
||||
// that mode with high-escape can be disabled now in UTFConvert.cpp
|
||||
if (flag is NOT set)
|
||||
it doesn't extract raw 8-bit symbol from High-Escape-Plane
|
||||
if (flag is set)
|
||||
it extracts raw 8-bit symbol from High-Escape-Plane
|
||||
|
||||
Main use cases:
|
||||
|
||||
WIN32 : UTF-16-RAW -> UTF-8 (archive) -> UTF-16-RAW
|
||||
{
|
||||
Do NOT set UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE.
|
||||
Do NOT set UTF_FLAG__TO_UTF8__SURROGATE_ERROR.
|
||||
So we restore original UTF-16-RAW.
|
||||
}
|
||||
|
||||
Linux : UTF-8 with Escapes -> UTF-16 (7z archive) -> UTF-8 with Escapes
|
||||
set UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE to extract non-UTF from 7z archive
|
||||
set UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE for intermediate UTF-16.
|
||||
Note: high escape mode can be ignored now in UTFConvert.cpp
|
||||
|
||||
macOS:
|
||||
the system doesn't support incorrect UTF-8 in file names.
|
||||
set UTF_FLAG__TO_UTF8__SURROGATE_ERROR
|
||||
*/
|
||||
|
||||
// Global flags variable; only declared here, defined in another translation unit.
// NOTE(review): presumably holds the UTF_FLAG__TO_UTF8__* bits used by the
// conversion functions that take no explicit (flags) argument - confirm.
extern unsigned g_Unicode_To_UTF8_Flags;
|
||||
|
||||
// Declaration only. Converts (src) into (dest); (flags) defaults to 0.
void ConvertUnicodeToUTF8_Flags(const UString &src, AString &dest, unsigned flags = 0);
|
||||
// Declaration only. Variant without a (flags) argument.
void ConvertUnicodeToUTF8(const UString &src, AString &dest);
|
||||
|
||||
// Declaration only. Variant that writes into a CByteBuffer instead of an AString.
void Convert_Unicode_To_UTF8_Buf(const UString &src, CByteBuffer &dest);
|
||||
|
||||
/*
|
||||
#ifndef _WIN32
|
||||
void Convert_UTF16_To_UTF32(const UString &src, UString &dest);
|
||||
void Convert_UTF32_To_UTF16(const UString &src, UString &dest);
|
||||
bool UTF32_IsThere_BigPoint(const UString &src);
|
||||
bool Unicode_IsThere_BmpEscape(const UString &src);
|
||||
#endif
|
||||
|
||||
bool Unicode_IsThere_Utf16SurrogateError(const UString &src);
|
||||
*/
|
||||
|
||||
// Convert_UnicodeEsc16_To_UnicodeEscHigh(s):
//   if _WCHART_IS_16BIT is defined, it is a macro that expands to nothing,
//   so a call to it compiles to an empty statement;
//   otherwise it is a real function (declared here, defined elsewhere)
//   that modifies (s) in place.
#ifdef _WCHART_IS_16BIT
|
||||
#define Convert_UnicodeEsc16_To_UnicodeEscHigh(s)
|
||||
#else
|
||||
void Convert_UnicodeEsc16_To_UnicodeEscHigh(UString &s);
|
||||
#endif
|
||||
|
||||
/*
|
||||
// #include "../../C/CpuArch.h"
|
||||
|
||||
// ---------- Utf16 Little endian functions ----------
|
||||
|
||||
// We store 16-bit surrogates even in 32-bit WCHARs in Linux.
|
||||
// So now we don't use the following code:
|
||||
|
||||
#if WCHAR_MAX > 0xffff
|
||||
|
||||
// void *p : pointer to src bytes stream
|
||||
// size_t len : num Utf16 characters : it can include or not include NULL character
|
||||
|
||||
inline size_t Utf16LE__Get_Num_WCHARs(const void *p, size_t len)
|
||||
{
|
||||
#if WCHAR_MAX > 0xffff
|
||||
size_t num_wchars = 0;
|
||||
for (size_t i = 0; i < len; i++)
|
||||
{
|
||||
wchar_t c = GetUi16(p);
|
||||
p = (const void *)((const Byte *)p + 2);
|
||||
if (c >= 0xd800 && c < 0xdc00 && i + 1 != len)
|
||||
{
|
||||
wchar_t c2 = GetUi16(p);
|
||||
if (c2 >= 0xdc00 && c2 < 0xe000)
|
||||
{
|
||||
c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
|
||||
p = (const void *)((const Byte *)p + 2);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
num_wchars++;
|
||||
}
|
||||
return num_wchars;
|
||||
#else
|
||||
UNUSED_VAR(p)
|
||||
return len;
|
||||
#endif
|
||||
}
|
||||
|
||||
// #include <stdio.h>
|
||||
|
||||
inline wchar_t *Utf16LE__To_WCHARs_Sep(const void *p, size_t len, wchar_t *dest)
|
||||
{
|
||||
for (size_t i = 0; i < len; i++)
|
||||
{
|
||||
wchar_t c = GetUi16(p);
|
||||
p = (const void *)((const Byte *)p + 2);
|
||||
|
||||
#if WCHAR_PATH_SEPARATOR != L'/'
|
||||
if (c == L'/')
|
||||
c = WCHAR_PATH_SEPARATOR;
|
||||
#endif
|
||||
|
||||
#if WCHAR_MAX > 0xffff
|
||||
|
||||
if (c >= 0xd800 && c < 0xdc00 && i + 1 != len)
|
||||
{
|
||||
wchar_t c2 = GetUi16(p);
|
||||
if (c2 >= 0xdc00 && c2 < 0xe000)
|
||||
{
|
||||
// printf("\nSurragate : %4x %4x -> ", (int)c, (int)c2);
|
||||
c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
|
||||
p = (const void *)((const Byte *)p + 2);
|
||||
i++;
|
||||
// printf("%4x\n", (int)c);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
*dest++ = c;
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
|
||||
|
||||
inline size_t Get_Num_Utf16_chars_from_wchar_string(const wchar_t *p)
|
||||
{
|
||||
size_t num = 0;
|
||||
for (;;)
|
||||
{
|
||||
wchar_t c = *p++;
|
||||
if (c == 0)
|
||||
return num;
|
||||
num += ((c >= 0x10000 && c < 0x110000) ? 2 : 1);
|
||||
}
|
||||
return num;
|
||||
}
|
||||
|
||||
inline Byte *wchars_to_Utf16LE(const wchar_t *p, Byte *dest)
|
||||
{
|
||||
for (;;)
|
||||
{
|
||||
wchar_t c = *p++;
|
||||
if (c == 0)
|
||||
return dest;
|
||||
if (c >= 0x10000 && c < 0x110000)
|
||||
{
|
||||
SetUi16(dest , (UInt16)(0xd800 + ((c >> 10) & 0x3FF)));
|
||||
SetUi16(dest + 2, (UInt16)(0xdc00 + ( c & 0x3FF)));
|
||||
dest += 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
SetUi16(dest, c);
|
||||
dest += 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
*/
|
||||
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user