[rtems commit] dosfs: UTF-8 Support: Multibyte conversions
Sebastian Huber
sebh at rtems.org
Mon Jun 3 15:23:54 UTC 2013
Module: rtems
Branch: master
Commit: 83a4cbb69d65c46a4d7b12de28064e4f550c1428
Changeset: http://git.rtems.org/rtems/commit/?id=83a4cbb69d65c46a4d7b12de28064e4f550c1428
Author: Ralf Kirchner <ralf.kirchner at embedded-brains.de>
Date: Thu May 23 15:48:54 2013 +0200
dosfs: UTF-8 Support: Multibyte conversions
Add optional conversion methods for multibyte strings. With these
conversions, which make use of iconv and utf8proc, it becomes possible to
use strings from any language (Czech, Chinese, Arabic, Hebrew, Korean,
...) for file names and directory names.
NOTE: Iconv support must be activated during the build of the tool chain
for these conversion methods (options --enable-newlib-iconv
--enable-newlib-iconv-encodings=[ENCODINGS_YOU_WANT]). Alternatively
you can provide your own conversion methods.
---
cpukit/libfs/Makefile.am | 1 +
cpukit/libfs/src/dosfs/dosfs.h | 17 ++-
cpukit/libfs/src/dosfs/msdos_conv_utf8.c | 308 ++++++++++++++++++++++++++++++
cpukit/libfs/src/dosfs/msdos_misc.c | 24 ++-
4 files changed, 340 insertions(+), 10 deletions(-)
diff --git a/cpukit/libfs/Makefile.am b/cpukit/libfs/Makefile.am
index e06c8bd..58733f7 100644
--- a/cpukit/libfs/Makefile.am
+++ b/cpukit/libfs/Makefile.am
@@ -82,6 +82,7 @@ libdosfs_a_SOURCES += src/dosfs/msdos_create.c src/dosfs/msdos_dir.c \
src/dosfs/msdos_mknod.c src/dosfs/msdos_node_type.c \
src/dosfs/msdos_rmnod.c src/dosfs/msdos_statvfs.c \
src/dosfs/msdos_conv_default.c \
+ src/dosfs/msdos_conv_utf8.c \
src/dosfs/msdos_conv.c src/dosfs/msdos.h src/dosfs/msdos_format.c \
src/dosfs/dosfs.h src/dosfs/msdos_rename.c
endif
diff --git a/cpukit/libfs/src/dosfs/dosfs.h b/cpukit/libfs/src/dosfs/dosfs.h
index f1c3d87..acfc143 100644
--- a/cpukit/libfs/src/dosfs/dosfs.h
+++ b/cpukit/libfs/src/dosfs/dosfs.h
@@ -206,7 +206,8 @@ typedef struct {
/**
* @brief Converter implementation for new filesystem instance.
*
- * @see rtems_dosfs_create_default_converter().
+ * @see rtems_dosfs_create_default_converter() and
+ * rtems_dosfs_create_utf8_converter().
*/
rtems_dosfs_convert_control *converter;
} rtems_dosfs_mount_options;
@@ -221,6 +222,20 @@ typedef struct {
*/
rtems_dosfs_convert_control *rtems_dosfs_create_default_converter(void);
+/**
+ * @brief Allocates and initializes a UTF-8 converter.
+ *
+ * The converter is backed by iconv(), so the tool chain must have been built
+ * with iconv support for the requested codepage.
+ *
+ * @param[in] codepage The iconv() identification string for the used codepage.
+ *
+ * @retval NULL Something failed.
+ * @retval other Pointer to initialized converter.
+ *
+ * @see rtems_dosfs_mount_options and mount().
+ */
+rtems_dosfs_convert_control *rtems_dosfs_create_utf8_converter(
+ const char *codepage
+);
+
#define MSDOS_FMT_INFO_LEVEL_NONE (0)
#define MSDOS_FMT_INFO_LEVEL_INFO (1)
#define MSDOS_FMT_INFO_LEVEL_DETAIL (2)
diff --git a/cpukit/libfs/src/dosfs/msdos_conv_utf8.c b/cpukit/libfs/src/dosfs/msdos_conv_utf8.c
new file mode 100644
index 0000000..a80db7e
--- /dev/null
+++ b/cpukit/libfs/src/dosfs/msdos_conv_utf8.c
@@ -0,0 +1,308 @@
+/**
+ * @file
+ *
+ * @ingroup DOSFS
+ *
+ * @brief UTF-8 Converter
+ */
+
+/*
+ * Copyright (c) 2013 embedded brains GmbH. All rights reserved.
+ *
+ * embedded brains GmbH
+ * Dornierstr. 4
+ * 82178 Puchheim
+ * Germany
+ * <rtems at embedded-brains.de>
+ *
+ * The license and distribution terms for this file may be
+ * found in the file LICENSE in this distribution or at
+ * http://www.rtems.com/license/LICENSE.
+ */
+
+#include <stddef.h>
+#include <assert.h>
+#include <errno.h>
+#include <iconv.h>
+#include <rtems/dosfs.h>
+#include <utf8proc/utf8proc.h>
+#include "msdos.h"
+
+/* Descriptor value that marks a conversion which iconv_open() could not set up. */
+#define INVALID_ICONV_DESC ( (iconv_t) -1 )
+
+/**
+ * @brief Converter instance which carries out its conversions with iconv().
+ */
+typedef struct {
+ /*
+ * This structure must be the first field, since otherwise the cast
+ * operations later in the file are invalid.
+ */
+ rtems_dosfs_convert_control super;
+
+ /* Descriptor for the conversion from the code page to UTF-8. */
+ iconv_t desc_codepage_to_utf8;
+ /* Descriptor for the conversion from UTF-8 to the code page. */
+ iconv_t desc_utf8_to_codepage;
+ /* Descriptor for the conversion from UTF-16LE to UTF-8. */
+ iconv_t desc_utf16_to_utf8;
+ /* Descriptor for the conversion from UTF-8 to UTF-16LE. */
+ iconv_t desc_utf8_to_utf16;
+ /* Storage which is published through the buffer member of super. */
+ uint8_t buffer[MSDOS_NAME_MAX_UTF8_LFN_BYTES];
+} msdos_utf8_convert_control;
+
+/**
+ * @brief Performs one iconv() conversion from @a src into @a dst.
+ *
+ * @param[in] desc Conversion descriptor obtained from iconv_open().
+ * @param[in] src The bytes to convert.
+ * @param[in] src_size Number of bytes available at @a src.
+ * @param[out] dst Buffer which receives the converted bytes.
+ * @param[in,out] dst_size On entry the capacity of @a dst in bytes.  On
+ *   return the number of bytes written to @a dst.
+ *
+ * @retval 0 The conversion was carried out without irregularities.
+ * @retval EINVAL iconv() reported a non-zero count of non-reversible
+ *   conversions.
+ * @retval ENOMEM iconv() failed.
+ */
+static int msdos_utf8_convert_with_iconv(
+  iconv_t     desc,
+  const void *src,
+  size_t      src_size,
+  void       *dst,
+  size_t     *dst_size
+)
+{
+  int    eno = 0;
+  size_t inbytes_left = src_size;
+  size_t outbytes_left = *dst_size;
+  /* The const qualifier is cast away only to match the iconv() prototype */
+  char  *inbuf = (void *) (uintptr_t) src;
+  char  *outbuf = dst;
+  size_t iconv_status;
+
+  iconv_status = iconv(
+    desc,
+    &inbuf,
+    &inbytes_left,
+    &outbuf,
+    &outbytes_left
+  );
+
+  /* Turn the capacity into the number of bytes actually produced */
+  *dst_size -= outbytes_left;
+
+  /*
+   * The iconv() status is unsigned and a failure is signalled by
+   * (size_t) -1.  It must be checked for explicitly and before the check for
+   * a positive count, since a comparison with "< 0" can never be true for a
+   * size_t and the failure value itself is greater than zero.
+   */
+  if ( iconv_status == (size_t) -1 ) {
+    eno = ENOMEM;
+  } else if ( iconv_status > 0 ) {
+    eno = EINVAL;
+  }
+
+  return eno;
+}
+
+/**
+ * @brief Converts a string from the code page encoding to UTF-8.
+ *
+ * @param[in] super The converter, must be a msdos_utf8_convert_control.
+ * @param[in] src The code page string.
+ * @param[in] src_size Size of @a src in bytes.
+ * @param[out] dst Buffer for the UTF-8 string.
+ * @param[in,out] dst_size Capacity of @a dst on entry, bytes written on
+ *   return.
+ *
+ * @return The status of msdos_utf8_convert_with_iconv().
+ */
+static int msdos_utf8_codepage_to_utf8(
+  rtems_dosfs_convert_control *super,
+  const char                  *src,
+  size_t                       src_size,
+  uint8_t                     *dst,
+  size_t                      *dst_size
+)
+{
+  /* Valid since super is the first member of the derived structure */
+  msdos_utf8_convert_control *ctl = (msdos_utf8_convert_control *) super;
+  iconv_t                     desc = ctl->desc_codepage_to_utf8;
+
+  return msdos_utf8_convert_with_iconv( desc, src, src_size, dst, dst_size );
+}
+
+/**
+ * @brief Converts a string from UTF-8 to the code page encoding.
+ *
+ * @param[in] super The converter, must be a msdos_utf8_convert_control.
+ * @param[in] src The UTF-8 string.
+ * @param[in] src_size Size of @a src in bytes.
+ * @param[out] dst Buffer for the code page string.
+ * @param[in,out] dst_size Capacity of @a dst on entry, bytes written on
+ *   return.
+ *
+ * @return The status of msdos_utf8_convert_with_iconv().
+ */
+static int msdos_utf8_utf8_to_codepage(
+  rtems_dosfs_convert_control *super,
+  const uint8_t               *src,
+  size_t                       src_size,
+  char                        *dst,
+  size_t                      *dst_size
+)
+{
+  /* Valid since super is the first member of the derived structure */
+  msdos_utf8_convert_control *ctl = (msdos_utf8_convert_control *) super;
+  iconv_t                     desc = ctl->desc_utf8_to_codepage;
+
+  return msdos_utf8_convert_with_iconv( desc, src, src_size, dst, dst_size );
+}
+
+/**
+ * @brief Converts a string from UTF-16 to UTF-8.
+ *
+ * @param[in] super The converter, must be a msdos_utf8_convert_control.
+ * @param[in] src The UTF-16 string.
+ * @param[in] src_size Size of @a src in bytes.
+ * @param[out] dst Buffer for the UTF-8 string.
+ * @param[in,out] dst_size Capacity of @a dst on entry, bytes written on
+ *   return.
+ *
+ * @return The status of msdos_utf8_convert_with_iconv().
+ */
+static int msdos_utf8_utf16_to_utf8(
+  rtems_dosfs_convert_control *super,
+  const uint16_t              *src,
+  size_t                       src_size,
+  uint8_t                     *dst,
+  size_t                      *dst_size
+)
+{
+  /* Valid since super is the first member of the derived structure */
+  msdos_utf8_convert_control *ctl = (msdos_utf8_convert_control *) super;
+  iconv_t                     desc = ctl->desc_utf16_to_utf8;
+
+  return msdos_utf8_convert_with_iconv( desc, src, src_size, dst, dst_size );
+}
+
+/**
+ * @brief Converts a string from UTF-8 to UTF-16.
+ *
+ * @param[in] super The converter, must be a msdos_utf8_convert_control.
+ * @param[in] src The UTF-8 string.
+ * @param[in] src_size Size of @a src in bytes.
+ * @param[out] dst Buffer for the UTF-16 string.
+ * @param[in,out] dst_size Capacity of @a dst on entry, bytes written on
+ *   return.
+ *
+ * @return The status of msdos_utf8_convert_with_iconv().
+ */
+static int msdos_utf8_utf8_to_utf16(
+  rtems_dosfs_convert_control *super,
+  const uint8_t               *src,
+  size_t                       src_size,
+  uint16_t                    *dst,
+  size_t                      *dst_size
+)
+{
+  /* Valid since super is the first member of the derived structure */
+  msdos_utf8_convert_control *ctl = (msdos_utf8_convert_control *) super;
+  iconv_t                     desc = ctl->desc_utf8_to_utf16;
+
+  return msdos_utf8_convert_with_iconv( desc, src, src_size, dst, dst_size );
+}
+
+/**
+ * @brief Maps a utf8proc status code to an errno value.
+ *
+ * @param[in] errcode Zero or one of the UTF8PROC_ERROR_* codes.
+ *
+ * @retval 0 The status code was zero.
+ * @retval ENOMEM The status code was UTF8PROC_ERROR_NOMEM.
+ * @retval EOVERFLOW The status code was UTF8PROC_ERROR_OVERFLOW.
+ * @retval EINVAL The status code was UTF8PROC_ERROR_INVALIDUTF8,
+ *   UTF8PROC_ERROR_NOTASSIGNED or UTF8PROC_ERROR_INVALIDOPTS.
+ * @retval ENOENT Any other status code.
+ */
+static int msdos_utf8proc_errmsg_to_errno( ssize_t errcode )
+{
+  switch ( errcode ) {
+    case 0:
+      return 0;
+    case UTF8PROC_ERROR_NOMEM:
+      return ENOMEM;
+    case UTF8PROC_ERROR_OVERFLOW:
+      return EOVERFLOW;
+    case UTF8PROC_ERROR_INVALIDUTF8:
+    case UTF8PROC_ERROR_NOTASSIGNED:
+    case UTF8PROC_ERROR_INVALIDOPTS:
+      return EINVAL;
+    default:
+      return ENOENT;
+  }
+}
+
+/**
+ * @brief Decomposes and case folds a UTF-8 string.
+ *
+ * The destination buffer is used twice: utf8proc_decompose() stores the
+ * resulting code points in it as 32-bit values, then utf8proc_reencode()
+ * turns these code points back into UTF-8 in place.
+ *
+ * @param[in] super Not used.
+ * @param[in] src The UTF-8 string to process.
+ * @param[in] src_size Size of @a src in bytes.
+ * @param[out] dst Buffer for the processed UTF-8 string.
+ * @param[in,out] dst_size Capacity of @a dst in bytes on entry.  Set to the
+ *   value returned by utf8proc_reencode() in case the re-encoding
+ *   succeeded, otherwise left unchanged.
+ *
+ * @retval 0 Successful operation.
+ * @retval ENOMEM The decomposition needed at least as many code points as
+ *   fit into @a dst.  Only the code points which fit were re-encoded.
+ * @retval other A utf8proc error translated by
+ *   msdos_utf8proc_errmsg_to_errno().
+ */
+static int msdos_utf8_normalize_and_fold(
+  rtems_dosfs_convert_control *super,
+  const uint8_t               *src,
+  const size_t                 src_size,
+  uint8_t                     *dst,
+  size_t                      *dst_size
+)
+{
+  int      status = 0;
+  int32_t *code_points = (int32_t *) dst;
+  ssize_t  capacity = *dst_size / sizeof( *code_points );
+  ssize_t  decomposed;
+  ssize_t  encoded;
+
+  (void) super;
+
+  decomposed = utf8proc_decompose(
+    src,
+    (ssize_t) src_size,
+    code_points,
+    capacity,
+    UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_CASEFOLD
+  );
+
+  if ( decomposed < 0 ) {
+    return msdos_utf8proc_errmsg_to_errno( decomposed );
+  }
+
+  /* One code point slot is always kept free, so the count is cut down */
+  if ( decomposed >= capacity ) {
+    decomposed = capacity - 1;
+    status = ENOMEM;
+  }
+
+  encoded = utf8proc_reencode(
+    code_points,
+    decomposed,
+    UTF8PROC_STABLE | UTF8PROC_DECOMPOSE
+  );
+
+  if ( encoded < 0 ) {
+    return msdos_utf8proc_errmsg_to_errno( encoded );
+  }
+
+  *dst_size = encoded;
+
+  return status;
+}
+
+/**
+ * @brief Closes all valid iconv descriptors of the converter and frees it.
+ *
+ * Descriptors equal to INVALID_ICONV_DESC are skipped.
+ *
+ * @param[in] super The converter, must be a msdos_utf8_convert_control.  It
+ *   must not be used after this call.
+ */
+static void msdos_utf8_destroy(
+  rtems_dosfs_convert_control *super
+)
+{
+  msdos_utf8_convert_control *ctl = (msdos_utf8_convert_control *) super;
+  /* The descriptors in the order in which they are closed */
+  const iconv_t               descs[] = {
+    ctl->desc_utf16_to_utf8,
+    ctl->desc_codepage_to_utf8,
+    ctl->desc_utf8_to_codepage,
+    ctl->desc_utf8_to_utf16
+  };
+  size_t                      i;
+  int                         rv;
+
+  for ( i = 0; i < sizeof( descs ) / sizeof( descs[0] ); ++i ) {
+    if ( descs[i] != INVALID_ICONV_DESC ) {
+      rv = iconv_close( descs[i] );
+      assert( rv == 0 );
+    }
+  }
+
+  free( ctl );
+}
+
+/**
+ * @brief Operations of the iconv() based converter.
+ *
+ * Installed as the handler of every converter set up by
+ * rtems_dosfs_create_utf8_converter().
+ */
+static const rtems_dosfs_convert_handler msdos_utf8_convert_handler = {
+  /* Code page <-> UTF-8 */
+  .codepage_to_utf8 = msdos_utf8_codepage_to_utf8,
+  .utf8_to_codepage = msdos_utf8_utf8_to_codepage,
+
+  /* UTF-16 <-> UTF-8 */
+  .utf16_to_utf8 = msdos_utf8_utf16_to_utf8,
+  .utf8_to_utf16 = msdos_utf8_utf8_to_utf16,
+
+  /* Name comparison support and tear down */
+  .utf8_normalize_and_fold = msdos_utf8_normalize_and_fold,
+  .destroy = msdos_utf8_destroy
+};
+
+/**
+ * @brief Allocates and initializes a converter which uses iconv().
+ *
+ * Four iconv descriptors are opened: code page to UTF-8, UTF-8 to code page,
+ * UTF-16LE to UTF-8 and UTF-8 to UTF-16LE.
+ *
+ * @param[in] codepage The iconv() identification string for the used
+ *   codepage.
+ *
+ * @retval NULL The @a codepage was NULL, the memory allocation failed or at
+ *   least one of the iconv descriptors could not be opened.
+ * @retval other Pointer to the initialized converter.  It is released through
+ *   the destroy handler.
+ */
+rtems_dosfs_convert_control *rtems_dosfs_create_utf8_converter(
+  const char *codepage
+)
+{
+  msdos_utf8_convert_control *self;
+
+  /* Do not hand a NULL encoding name to iconv_open() */
+  if ( codepage == NULL ) {
+    return NULL;
+  }
+
+  self = malloc( sizeof( *self ) );
+
+  if ( self == NULL ) {
+    return NULL;
+  }
+
+  /*
+   * Open all descriptors before checking any of them, so that every
+   * descriptor field has a defined value for msdos_utf8_destroy().
+   */
+  self->desc_codepage_to_utf8 = iconv_open( "UTF-8", codepage );
+  self->desc_utf8_to_codepage = iconv_open( codepage, "UTF-8" );
+  self->desc_utf16_to_utf8 = iconv_open( "UTF-8", "UTF-16LE" );
+  self->desc_utf8_to_utf16 = iconv_open( "UTF-16LE", "UTF-8" );
+
+  if (
+    self->desc_utf16_to_utf8 == INVALID_ICONV_DESC
+      || self->desc_utf8_to_codepage == INVALID_ICONV_DESC
+      || self->desc_codepage_to_utf8 == INVALID_ICONV_DESC
+      || self->desc_utf8_to_utf16 == INVALID_ICONV_DESC
+  ) {
+    /* Closes the descriptors which were opened and frees the memory */
+    msdos_utf8_destroy( &self->super );
+
+    /*
+     * Return NULL explicitly instead of forming the address of a member
+     * through a NULL pointer.
+     */
+    return NULL;
+  }
+
+  self->super.handler = &msdos_utf8_convert_handler;
+  self->super.buffer.data = &self->buffer;
+  self->super.buffer.size = sizeof( self->buffer );
+
+  return &self->super;
+}
diff --git a/cpukit/libfs/src/dosfs/msdos_misc.c b/cpukit/libfs/src/dosfs/msdos_misc.c
index 56b58c8..959768c 100644
--- a/cpukit/libfs/src/dosfs/msdos_misc.c
+++ b/cpukit/libfs/src/dosfs/msdos_misc.c
@@ -42,6 +42,13 @@
#include <stdio.h>
+/* Number of bytes occupied by the name characters of one LFN entry. */
+#define MSDOS_LFN_ENTRY_SIZE \
+ (MSDOS_LFN_LEN_PER_ENTRY * MSDOS_NAME_LFN_BYTES_PER_CHAR)
+
+/*
+ * Size in bytes of the buffers which hold the normalized UTF-8 form of the
+ * name characters of one LFN entry.
+ * NOTE(review): the one extra character presumably leaves room for a
+ * terminator -- confirm.
+ */
+#define MSDOS_LFN_ENTRY_SIZE_UTF8 \
+ ((MSDOS_LFN_LEN_PER_ENTRY + 1 ) * MSDOS_NAME_LFN_BYTES_PER_CHAR \
+ * MSDOS_NAME_MAX_UTF8_BYTES_PER_CHAR)
+
/*
* External strings. Saves space this way.
*/
@@ -1019,7 +1026,7 @@ msdos_get_utf16_string_from_long_entry (
{
ssize_t chars_in_entry;
- if (buf_size >= MSDOS_LFN_LEN_PER_ENTRY * MSDOS_NAME_LFN_BYTES_PER_CHAR) {
+ if (buf_size >= MSDOS_LFN_ENTRY_SIZE) {
memcpy (&entry_string_buf[0], &entry[1], 10 );
memcpy (&entry_string_buf[5], &entry[14], 12 );
memcpy (&entry_string_buf[11], &entry[28], 4 );
@@ -1195,7 +1202,7 @@ msdos_compare_entry_against_filename (
{
ssize_t size_remaining = filename_size_remaining;
int eno = 0;
- uint8_t entry_normalized[( MSDOS_LFN_LEN_PER_ENTRY + 1 ) * MSDOS_NAME_LFN_BYTES_PER_CHAR * MSDOS_NAME_MAX_UTF8_BYTES_PER_CHAR];
+ uint8_t entry_normalized[MSDOS_LFN_ENTRY_SIZE_UTF8];
size_t bytes_in_entry_normalized = sizeof ( entry_normalized );
eno = (*converter->handler->utf8_normalize_and_fold) (
@@ -1263,7 +1270,7 @@ msdos_find_file_in_directory (
bool empty_space_found = false;
uint32_t entries_per_block = bts2rd / MSDOS_DIRECTORY_ENTRY_STRUCT_SIZE;
int lfn_entry = 0;
- uint8_t entry_utf8_normalized[(MSDOS_LFN_LEN_PER_ENTRY + 1 ) * MSDOS_NAME_LFN_BYTES_PER_CHAR * MSDOS_NAME_MAX_UTF8_BYTES_PER_CHAR/*MSDOS_ENTRY_LFN_UTF8_BYTES*/];
+ uint8_t entry_utf8_normalized[MSDOS_LFN_ENTRY_SIZE_UTF8];
size_t bytes_in_entry;
bool filename_matched = false;
ssize_t filename_size_remaining = name_len_for_compare;
@@ -1800,7 +1807,7 @@ msdos_add_file (
*MSDOS_DIR_LFN_CHECKSUM(entry) = lfn_checksum;
p = entry + 1;
- n = name_converted + (fat_entries - lfn_entry) * MSDOS_LFN_LEN_PER_ENTRY * MSDOS_NAME_LFN_BYTES_PER_CHAR;
+ n = name_converted + (fat_entries - lfn_entry) * MSDOS_LFN_ENTRY_SIZE;
#if MSDOS_FIND_PRINT
printf ("MSFS:[11] ");
@@ -1919,8 +1926,8 @@ msdos_find_name_in_fat_file (
buffer,
buffer_size);
if (name_len_for_save > 0) {
- fat_entries = (name_len_for_save -1
- + (MSDOS_LFN_LEN_PER_ENTRY * MSDOS_NAME_LFN_BYTES_PER_CHAR)) / (MSDOS_LFN_LEN_PER_ENTRY * MSDOS_NAME_LFN_BYTES_PER_CHAR);
+ fat_entries = (name_len_for_save + MSDOS_LFN_ENTRY_SIZE - 1)
+ / MSDOS_LFN_ENTRY_SIZE;
name_len_for_compare = msdos_filename_utf8_to_long_name_for_compare (
converter,
name_utf8,
@@ -1984,9 +1991,8 @@ msdos_find_name_in_fat_file (
buffer,
buffer_size);
if (name_len_for_save > 0) {
- fat_entries = (name_len_for_save -1
- + (MSDOS_LFN_LEN_PER_ENTRY * MSDOS_NAME_LFN_BYTES_PER_CHAR)) / (MSDOS_LFN_LEN_PER_ENTRY * MSDOS_NAME_LFN_BYTES_PER_CHAR);
-
+ fat_entries = (name_len_for_save + MSDOS_LFN_ENTRY_SIZE - 1)
+ / MSDOS_LFN_ENTRY_SIZE;
}
else
retval = -1;
More information about the vc
mailing list