Skip to content
Open
74 changes: 65 additions & 9 deletions lib/IMAP/Charset/Converter.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,40 @@

use Horde_Mime_Part;
use OCA\Mail\Exception\ServiceException;
use ValueError;
use function in_array;
use function is_string;

class Converter {

/**
* Map of unsupported charset names to their mbstring equivalents.
* Keys must be lowercase for case-insensitive lookup.
*
* @see http://lists.w3.org/Archives/Public/ietf-charsets/2001AprJun/0030.html
*/
private const CHARSET_MAP = [
'ks_c_5601-1987' => 'UHC',
'ks_c_5601-1989' => 'UHC',
];

/**
* Normalize charset names for mbstring compatibility.
*
* Maps unsupported charset names to their mbstring equivalents.
* Notably, handles Korean encodings used by Outlook:
* - ks_c_5601-1987 and ks_c_5601-1989 are mapped to UHC (Windows-949/CP949)
*
* Charset tokens are case-insensitive in email headers (RFC 2045),
* so we normalize to lowercase for lookup.
*/
private function normalizeCharset(string $charset): string {
$charset = trim($charset);
$lowerCharset = strtolower($charset);

return self::CHARSET_MAP[$lowerCharset] ?? $charset;
}

/**
* @param Horde_Mime_Part $p
* @return string
Expand All @@ -37,10 +66,17 @@

// The part specifies a charset
if ($charset !== null) {
if (in_array($charset, mb_list_encodings(), true)) {
$converted = mb_convert_encoding($data, 'UTF-8', $charset);
} else {
$converted = iconv($charset, 'UTF-8', $data);
$normalizedCharset = $this->normalizeCharset($charset);
try {
if (in_array($normalizedCharset, mb_list_encodings(), true)) {
$converted = mb_convert_encoding($data, 'UTF-8', $normalizedCharset);
} else {
$converted = @iconv($normalizedCharset, 'UTF-8', $data);
}
} catch (ValueError) {
// Invalid charset name, treat as null to use auto-detection below
$charset = null;
$converted = null;
}
Comment on lines +75 to 80
Copy link

Copilot AI Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After catching ValueError for an invalid charset name, the code comment says it should “fall through to auto-detection”, but later $normalizedCharset is recomputed from the original (still-invalid) $charset and passed to mb_convert_encoding()/iconv() again. This prevents the intended autodetect fallback and guarantees a ServiceException for invalid charset tokens even when mbstring could detect/convert the bytes. Consider treating an invalid/empty charset as null (or setting a flag) so the later conversion uses autodetection instead of retrying the invalid name.

Copilot uses AI. Check for mistakes.

if (is_string($converted)) {
Expand All @@ -51,22 +87,42 @@
// No charset specified, let's ask mb if this could be UTF-8
$detectedCharset = mb_detect_encoding($data, 'UTF-8', true);
if ($detectedCharset === false) {
// Fallback, non UTF-8
$detectedCharset = mb_detect_encoding($data, null, true);
// Fallback, try common charsets (the default mb_detect_encoding order may miss some)
$detectedCharset = mb_detect_encoding($data, 'ISO-8859-1,ISO-8859-2,UTF-8,ASCII', true);
}
// Still UTF8, no need to convert
if ($detectedCharset !== false && strtoupper($detectedCharset) === 'UTF-8') {
return $data;
}

$converted = @mb_convert_encoding($data, 'UTF-8', $charset);
// Use detected charset when available, otherwise use original/normalized charset
if ($detectedCharset !== false) {
$sourceCharset = $detectedCharset;
} elseif ($charset !== null) {
$sourceCharset = $this->normalizeCharset($charset);
} else {
$sourceCharset = null;
}

// Attempt conversion with the source charset
try {
$converted = @mb_convert_encoding($data, 'UTF-8', $sourceCharset);
} catch (ValueError) {
$converted = false;
}

if ($converted === false) {
// Might be a charset that PHP mb doesn't know how to handle, fall back to iconv
$converted = iconv($charset, 'UTF-8', $data);
try {
$converted = @iconv($sourceCharset, 'UTF-8', $data);

Check failure on line 117 in lib/IMAP/Charset/Converter.php

View workflow job for this annotation

GitHub Actions / static-psalm-analysis dev-stable32

PossiblyNullArgument

lib/IMAP/Charset/Converter.php:117:25: PossiblyNullArgument: Argument 1 of iconv cannot be null, possibly null value provided (see https://psalm.dev/078)

Check failure on line 117 in lib/IMAP/Charset/Converter.php

View workflow job for this annotation

GitHub Actions / static-psalm-analysis dev-master

PossiblyNullArgument

lib/IMAP/Charset/Converter.php:117:25: PossiblyNullArgument: Argument 1 of iconv cannot be null, possibly null value provided (see https://psalm.dev/078)

Check failure on line 117 in lib/IMAP/Charset/Converter.php

View workflow job for this annotation

GitHub Actions / static-psalm-analysis dev-stable33

PossiblyNullArgument

lib/IMAP/Charset/Converter.php:117:25: PossiblyNullArgument: Argument 1 of iconv cannot be null, possibly null value provided (see https://psalm.dev/078)
} catch (ValueError) {
// Invalid charset, conversion not possible
$converted = null;
}
}

if (!is_string($converted)) {
throw new ServiceException('Could not detect message charset');
throw new ServiceException('Could not convert message charset');
}
return $converted;
}
Expand Down
86 changes: 78 additions & 8 deletions tests/Unit/IMAP/Charset/ConverterTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,28 @@ public function dataProviderMimeParts(): array {
$iso2022jpMimePart->setType('text/plain');
$iso2022jpMimePart->setCharset('ISO-2022-JP');
$iso2022jpMimePart->setContents(mb_convert_encoding('外せ園査リツハワ題', 'ISO-2022-JP', 'UTF-8'));
$iso2022jpMimePart_noCharset = new Horde_Mime_Part();
$iso2022jpMimePart_noCharset->setContents('外せ園査リツハワ題');
// Korean - not in mb nor iconv
// $iso106461MimePart = new Horde_Mime_Part();
// $iso106461MimePart->setType('text/plain');
// $iso106461MimePart->setCharset('ISO 10646-1');
//$iso106461MimePart->setContents(iconv('UTF-8', 'ISO 10646-1', '언론·출판은 타인의 명'));
// Korean (Outlook) - ks_c_5601-1987 is mapped to UHC (CP949)
// Use iconv for encoding to avoid dependency on mbstring's UHC support
$koreanKsc56011987MimePart = new Horde_Mime_Part();
$koreanKsc56011987MimePart->setType('text/plain');
$koreanKsc56011987MimePart->setCharset('ks_c_5601-1987');
$koreanText = '안녕하세요';
$koreanKsc56011987MimePart->setContents(iconv('UTF-8', 'CP949', $koreanText));
// Korean (Outlook) - ks_c_5601-1989 is also mapped to UHC (CP949)
$koreanKsc56011989MimePart = new Horde_Mime_Part();
$koreanKsc56011989MimePart->setType('text/plain');
$koreanKsc56011989MimePart->setCharset('ks_c_5601-1989');
$koreanKsc56011989MimePart->setContents(iconv('UTF-8', 'CP949', $koreanText));
// Korean (Outlook) - uppercase variant KS_C_5601-1987 (case-insensitive)
$koreanKsc56011987UpperMimePart = new Horde_Mime_Part();
$koreanKsc56011987UpperMimePart->setType('text/plain');
$koreanKsc56011987UpperMimePart->setCharset('KS_C_5601-1987');
$koreanKsc56011987UpperMimePart->setContents(iconv('UTF-8', 'CP949', $koreanText));
// Korean (Outlook) - mixed case variant Ks_C_5601-1987 (case-insensitive)
$koreanKsc56011987MixedMimePart = new Horde_Mime_Part();
$koreanKsc56011987MixedMimePart->setType('text/plain');
$koreanKsc56011987MixedMimePart->setCharset('Ks_C_5601-1987');
$koreanKsc56011987MixedMimePart->setContents(iconv('UTF-8', 'CP949', $koreanText));
// Arabic - not in mb
$windowsMimePart = new Horde_Mime_Part();
$windowsMimePart->setType('text/plain');
Expand All @@ -79,8 +94,63 @@ public function dataProviderMimeParts(): array {
[$iso88591MimePart, 'Ümlaut'],
[$iso2022jpMimePart, '外せ園査リツハワ題'],
[$iso88591MimePart_noCharset, 'בה בדף לחבר ממונרכיה, בקר בגרסה ואמנות דת'],
// [$iso106461MimePart, '언론·출판은 타인의 명'],
[$koreanKsc56011987MimePart, $koreanText],
[$koreanKsc56011989MimePart, $koreanText],
[$koreanKsc56011987UpperMimePart, $koreanText],
[$koreanKsc56011987MixedMimePart, $koreanText],
[$windowsMimePart, 'قام زهاء أوراقهم ما,']
];
}

/**
* Test that conversion succeeds when no charset is specified in the MIME header.
*
* This tests the code path where $charset is null. The Converter should:
* 1. Use mb_detect_encoding() to detect the source encoding
* 2. Use the detected charset for conversion to UTF-8
*
* Without detection, conversion would fail or produce garbled output.
*/
public function testConvertWithNullCharsetFallback(): void {
// Create a mock that returns null for getCharset() to test the null charset path
$mimePart = $this->createMock(Horde_Mime_Part::class);
$mimePart->method('getContents')
->willReturn(mb_convert_encoding('Tëst', 'ISO-8859-1', 'UTF-8'));
$mimePart->method('getCharset')
->willReturn(null);

// Should complete without ValueError and return the correctly converted text
$result = $this->converter->convert($mimePart);

// Verify actual conversion correctness, not just UTF-8 validity
$this->assertEquals('Tëst', $result);
// Also verify it's valid UTF-8
$this->assertTrue(mb_check_encoding($result, 'UTF-8'));
}

/**
* Test that an invalid/unknown charset name does not let ValueError bubble up.
*
* When an invalid charset is provided, Converter catches the ValueError
* and falls back to mbstring auto-detection. The result depends on
* mb_detect_order, but the important behavior is that no ValueError escapes.
*/
public function testConvertWithInvalidCharsetDoesNotThrowValueError(): void {
$mimePart = $this->createMock(Horde_Mime_Part::class);
$mimePart->method('getContents')
->willReturn(mb_convert_encoding('Tëst with spëcial chärs', 'ISO-8859-1', 'UTF-8'));
$mimePart->method('getCharset')
->willReturn('INVALID-CHARSET-NAME-12345');

$thrown = null;
try {
$this->converter->convert($mimePart);
} catch (\ValueError $e) {
$thrown = $e;
} catch (\OCA\Mail\Exception\ServiceException) {
// ServiceException is acceptable (auto-detection failed)
}

$this->assertNull($thrown, 'ValueError should not bubble up from convert()');
}
}
Loading