[rtems commit] dosfs: UTF-8 Support: Multibyte conversions

Sebastian Huber sebh at rtems.org
Mon Jun 3 15:23:54 UTC 2013


Module:    rtems
Branch:    master
Commit:    83a4cbb69d65c46a4d7b12de28064e4f550c1428
Changeset: http://git.rtems.org/rtems/commit/?id=83a4cbb69d65c46a4d7b12de28064e4f550c1428

Author:    Ralf Kirchner <ralf.kirchner at embedded-brains.de>
Date:      Thu May 23 15:48:54 2013 +0200

dosfs: UTF-8 Support: Multibyte conversions

Add optional conversion methods for multibyte strings. These
conversions, which make use of iconv and utf8proc, make it possible to
use strings from any language (Czech, Chinese, Arabic, Hebrew, Korean,
...) for file names and directory names.

NOTE: Iconv support must be activated during the build of the tool chain
for these conversion methods (options --enable-newlib-iconv
--enable-newlib-iconv-encodings=[ENCODINGS_YOU_WANT]).  Alternatively
you can provide your own conversion methods.

---

 cpukit/libfs/Makefile.am                 |    1 +
 cpukit/libfs/src/dosfs/dosfs.h           |   17 ++-
 cpukit/libfs/src/dosfs/msdos_conv_utf8.c |  308 ++++++++++++++++++++++++++++++
 cpukit/libfs/src/dosfs/msdos_misc.c      |   24 ++-
 4 files changed, 340 insertions(+), 10 deletions(-)

diff --git a/cpukit/libfs/Makefile.am b/cpukit/libfs/Makefile.am
index e06c8bd..58733f7 100644
--- a/cpukit/libfs/Makefile.am
+++ b/cpukit/libfs/Makefile.am
@@ -82,6 +82,7 @@ libdosfs_a_SOURCES += src/dosfs/msdos_create.c src/dosfs/msdos_dir.c \
     src/dosfs/msdos_mknod.c src/dosfs/msdos_node_type.c \
     src/dosfs/msdos_rmnod.c src/dosfs/msdos_statvfs.c \
     src/dosfs/msdos_conv_default.c \
+    src/dosfs/msdos_conv_utf8.c \
     src/dosfs/msdos_conv.c src/dosfs/msdos.h src/dosfs/msdos_format.c \
     src/dosfs/dosfs.h src/dosfs/msdos_rename.c
 endif
diff --git a/cpukit/libfs/src/dosfs/dosfs.h b/cpukit/libfs/src/dosfs/dosfs.h
index f1c3d87..acfc143 100644
--- a/cpukit/libfs/src/dosfs/dosfs.h
+++ b/cpukit/libfs/src/dosfs/dosfs.h
@@ -206,7 +206,8 @@ typedef struct {
   /**
    * @brief Converter implementation for new filesystem instance.
    *
-   * @see rtems_dosfs_create_default_converter().
+   * @see rtems_dosfs_create_default_converter() and
+   * rtems_dosfs_create_utf8_converter().
    */
   rtems_dosfs_convert_control *converter;
 } rtems_dosfs_mount_options;
@@ -221,6 +222,20 @@ typedef struct {
  */
 rtems_dosfs_convert_control *rtems_dosfs_create_default_converter(void);
 
+/**
+ * @brief Allocates and initializes a UTF-8 converter.
+ *
+ * @param[in] codepage The iconv() identification string for the used codepage.
+ *
+ * @retval NULL Something failed.
+ * @retval other Pointer to initialized converter.
+ *
+ * @see rtems_dosfs_mount_options and mount().
+ */
+rtems_dosfs_convert_control *rtems_dosfs_create_utf8_converter(
+  const char *codepage
+);
+
 #define MSDOS_FMT_INFO_LEVEL_NONE   (0)
 #define MSDOS_FMT_INFO_LEVEL_INFO   (1)
 #define MSDOS_FMT_INFO_LEVEL_DETAIL (2)
diff --git a/cpukit/libfs/src/dosfs/msdos_conv_utf8.c b/cpukit/libfs/src/dosfs/msdos_conv_utf8.c
new file mode 100644
index 0000000..a80db7e
--- /dev/null
+++ b/cpukit/libfs/src/dosfs/msdos_conv_utf8.c
@@ -0,0 +1,308 @@
+/**
+ * @file
+ *
+ * @ingroup DOSFS
+ *
+ * @brief UTF-8 Converter
+ */
+
+/*
+ * Copyright (c) 2013 embedded brains GmbH.  All rights reserved.
+ *
+ *  embedded brains GmbH
+ *  Dornierstr. 4
+ *  82178 Puchheim
+ *  Germany
+ *  <rtems at embedded-brains.de>
+ *
+ * The license and distribution terms for this file may be
+ * found in the file LICENSE in this distribution or at
+ * http://www.rtems.com/license/LICENSE.
+ */
+
+#include <stddef.h>
+#include <assert.h>
+#include <errno.h>
+#include <iconv.h>
+#include <rtems/dosfs.h>
+#include <utf8proc/utf8proc.h>
+#include "msdos.h"
+
+#define INVALID_ICONV_DESC ( (iconv_t) -1 )
+
+typedef struct {
+  /*
+   * Must stay the first member: the handlers cast the
+   * rtems_dosfs_convert_control pointer they receive back to this type.
+   */
+  rtems_dosfs_convert_control super;
+
+  iconv_t desc_codepage_to_utf8; /* iconv_open( "UTF-8", codepage ) */
+  iconv_t desc_utf8_to_codepage; /* iconv_open( codepage, "UTF-8" ) */
+  iconv_t desc_utf16_to_utf8;    /* iconv_open( "UTF-8", "UTF-16LE" ) */
+  iconv_t desc_utf8_to_utf16;    /* iconv_open( "UTF-16LE", "UTF-8" ) */
+  uint8_t buffer[MSDOS_NAME_MAX_UTF8_LFN_BYTES]; /* backs super.buffer */
+} msdos_utf8_convert_control;
+
+/* Runs one iconv() conversion; *dst_size is in: capacity, out: bytes written. */
+static int msdos_utf8_convert_with_iconv(
+  iconv_t     desc,
+  const void *src,
+  size_t      src_size,
+  void       *dst,
+  size_t     *dst_size
+)
+{
+  int     eno = 0;
+  size_t  inbytes_left = src_size;
+  size_t  outbytes_left = *dst_size;
+  char   *inbuf = (void *) (uintptr_t) src; /* iconv() takes char **, not const */
+  char   *outbuf = dst;
+  size_t  iconv_status;
+
+  iconv_status = iconv( desc, &inbuf, &inbytes_left, &outbuf, &outbytes_left );
+
+  *dst_size -= outbytes_left;
+
+  /*
+   * iconv() reports failure as (size_t) -1, so a "< 0" test on the unsigned
+   * status can never fire.  E2BIG (destination exhausted) becomes ENOMEM;
+   * other failures and non-reversible conversions (status > 0) are EINVAL.
+   */
+  if ( iconv_status == (size_t) -1 ) {
+    eno = ( errno == E2BIG ) ? ENOMEM : EINVAL;
+  } else if ( iconv_status > 0 ) {
+    eno = EINVAL;
+  }
+
+  return eno;
+}
+
+/*
+ * Handler: converts src from the codepage to UTF-8 in dst.
+ *
+ * All arguments are handed unchanged to msdos_utf8_convert_with_iconv().
+ */
+static int msdos_utf8_codepage_to_utf8(
+  rtems_dosfs_convert_control *super,
+  const char                  *src,
+  size_t                       src_size,
+  uint8_t                     *dst,
+  size_t                      *dst_size
+)
+{
+  iconv_t cd =
+    ( (msdos_utf8_convert_control *) super )->desc_codepage_to_utf8;
+
+  return msdos_utf8_convert_with_iconv( cd, src, src_size, dst, dst_size );
+}
+
+/*
+ * Handler: converts src from UTF-8 to the codepage in dst.
+ *
+ * All arguments are handed unchanged to msdos_utf8_convert_with_iconv().
+ */
+static int msdos_utf8_utf8_to_codepage(
+  rtems_dosfs_convert_control *super,
+  const uint8_t               *src,
+  size_t                       src_size,
+  char                        *dst,
+  size_t                      *dst_size
+)
+{
+  iconv_t cd =
+    ( (msdos_utf8_convert_control *) super )->desc_utf8_to_codepage;
+
+  return msdos_utf8_convert_with_iconv( cd, src, src_size, dst, dst_size );
+}
+
+/*
+ * Handler: converts src from UTF-16LE to UTF-8 in dst.
+ *
+ * All arguments are handed unchanged to msdos_utf8_convert_with_iconv().
+ */
+static int msdos_utf8_utf16_to_utf8(
+  rtems_dosfs_convert_control *super,
+  const uint16_t              *src,
+  size_t                       src_size,
+  uint8_t                     *dst,
+  size_t                      *dst_size
+)
+{
+  iconv_t cd =
+    ( (msdos_utf8_convert_control *) super )->desc_utf16_to_utf8;
+
+  return msdos_utf8_convert_with_iconv( cd, src, src_size, dst, dst_size );
+}
+
+/*
+ * Handler: converts src from UTF-8 to UTF-16LE in dst.
+ *
+ * All arguments are handed unchanged to msdos_utf8_convert_with_iconv().
+ */
+static int msdos_utf8_utf8_to_utf16(
+  rtems_dosfs_convert_control *super,
+  const uint8_t               *src,
+  size_t                       src_size,
+  uint16_t                    *dst,
+  size_t                      *dst_size
+)
+{
+  iconv_t cd =
+    ( (msdos_utf8_convert_control *) super )->desc_utf8_to_utf16;
+
+  return msdos_utf8_convert_with_iconv( cd, src, src_size, dst, dst_size );
+}
+
+/*
+ * Translates a utf8proc status code into an errno value.
+ *
+ * 0 is passed through, NOMEM and OVERFLOW map to ENOMEM and EOVERFLOW,
+ * INVALIDUTF8, NOTASSIGNED and INVALIDOPTS all map to EINVAL, and any
+ * other code yields ENOENT.
+ */
+static int msdos_utf8proc_errmsg_to_errno( ssize_t errcode )
+{
+  switch ( errcode ) {
+    case 0:
+      /* Success. */
+      return 0;
+
+    case UTF8PROC_ERROR_NOMEM:
+      return ENOMEM;
+
+    case UTF8PROC_ERROR_OVERFLOW:
+      return EOVERFLOW;
+
+    case UTF8PROC_ERROR_INVALIDUTF8:
+    case UTF8PROC_ERROR_NOTASSIGNED:
+    case UTF8PROC_ERROR_INVALIDOPTS:
+      /* Bad input or bad options. */
+      return EINVAL;
+
+    default:
+      /* Any status code not listed above. */
+      return ENOENT;
+  }
+}
+
+/*
+ * Decomposes and case-folds src with utf8proc, using dst as scratch space.
+ * *dst_size is in: capacity, out: bytes produced; ENOMEM signals truncation.
+ */
+static int msdos_utf8_normalize_and_fold(
+  rtems_dosfs_convert_control *super,
+  const uint8_t *src,
+  const size_t   src_size,
+  uint8_t       *dst,
+  size_t        *dst_size
+)
+{
+  int      eno              = 0;
+  int32_t *unicode_buf      = (int32_t *) dst;
+  ssize_t  unicode_buf_size = *dst_size / sizeof( *unicode_buf );
+  ssize_t  result;
+
+  (void) super;
+
+  result = utf8proc_decompose(
+    src,
+    (ssize_t) src_size,
+    unicode_buf,
+    unicode_buf_size,
+    UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_CASEFOLD
+  );
+
+  if ( result < 0 ) {
+    return msdos_utf8proc_errmsg_to_errno( result );
+  }
+  if ( unicode_buf_size < 1 ) {
+    /* No room for a single code point: never pass a count of -1 on. */
+    *dst_size = 0;
+    return ENOMEM;
+  }
+
+  if ( result >= unicode_buf_size ) {
+    /* Truncated; one slot stays unused (presumably for the terminator). */
+    result = unicode_buf_size - 1;
+    eno = ENOMEM;
+  }
+  result = utf8proc_reencode(
+    unicode_buf, result, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE );
+  if ( result < 0 ) {
+    return msdos_utf8proc_errmsg_to_errno( result );
+  }
+  *dst_size = result;
+  return eno;
+}
+
+/*
+ * Destroy handler: closes every iconv descriptor that is not
+ * INVALID_ICONV_DESC and then frees the converter object itself.
+ * A failing iconv_close() trips an assertion.
+ */
+static void msdos_utf8_destroy(
+  rtems_dosfs_convert_control *super
+)
+{
+  msdos_utf8_convert_control *ctl = (msdos_utf8_convert_control *) super;
+  iconv_t                     descs[] = {
+    ctl->desc_utf16_to_utf8,
+    ctl->desc_codepage_to_utf8,
+    ctl->desc_utf8_to_codepage,
+    ctl->desc_utf8_to_utf16
+  };
+  size_t                      i;
+
+  for ( i = 0; i < sizeof( descs ) / sizeof( descs[ 0 ] ); ++i ) {
+    /* Skip descriptors that hold the invalid marker. */
+    if ( descs[ i ] != INVALID_ICONV_DESC ) {
+      int rv = iconv_close( descs[ i ] );
+
+      assert( rv == 0 );
+    }
+  }
+
+  free( ctl );
+}
+
+static const rtems_dosfs_convert_handler msdos_utf8_convert_handler = {
+  .utf8_to_codepage = msdos_utf8_utf8_to_codepage,
+  .codepage_to_utf8 = msdos_utf8_codepage_to_utf8,
+  .utf8_to_utf16 = msdos_utf8_utf8_to_utf16,
+  .utf16_to_utf8 = msdos_utf8_utf16_to_utf8,
+  .utf8_normalize_and_fold = msdos_utf8_normalize_and_fold,
+  .destroy = msdos_utf8_destroy
+}; /* installed as super.handler by rtems_dosfs_create_utf8_converter() */
+
+/* Creates a converter between codepage, UTF-8 and UTF-16LE; NULL on failure. */
+rtems_dosfs_convert_control *rtems_dosfs_create_utf8_converter(
+  const char *codepage
+)
+{
+  msdos_utf8_convert_control *self = malloc( sizeof( *self ) );
+
+  if ( self == NULL ) {
+    return NULL;
+  }
+
+  self->desc_codepage_to_utf8 = iconv_open( "UTF-8", codepage );
+  self->desc_utf8_to_codepage = iconv_open( codepage, "UTF-8" );
+  self->desc_utf16_to_utf8    = iconv_open( "UTF-8", "UTF-16LE" );
+  self->desc_utf8_to_utf16    = iconv_open( "UTF-16LE", "UTF-8" );
+
+  if (
+    self->desc_utf16_to_utf8 == INVALID_ICONV_DESC
+      || self->desc_utf8_to_codepage == INVALID_ICONV_DESC
+      || self->desc_codepage_to_utf8 == INVALID_ICONV_DESC
+      || self->desc_utf8_to_utf16 == INVALID_ICONV_DESC
+  ) {
+    msdos_utf8_destroy( &self->super ); /* closes what opened, frees self */
+    return NULL; /* do not compute &self->super from a null pointer */
+  }
+
+  self->super.handler = &msdos_utf8_convert_handler;
+  self->super.buffer.data = &self->buffer;
+  self->super.buffer.size = sizeof( self->buffer );
+  return &self->super;
+}
diff --git a/cpukit/libfs/src/dosfs/msdos_misc.c b/cpukit/libfs/src/dosfs/msdos_misc.c
index 56b58c8..959768c 100644
--- a/cpukit/libfs/src/dosfs/msdos_misc.c
+++ b/cpukit/libfs/src/dosfs/msdos_misc.c
@@ -42,6 +42,13 @@
 
 #include <stdio.h>
 
+#define MSDOS_LFN_ENTRY_SIZE \
+  (MSDOS_LFN_LEN_PER_ENTRY * MSDOS_NAME_LFN_BYTES_PER_CHAR)
+
+#define MSDOS_LFN_ENTRY_SIZE_UTF8 \
+  ((MSDOS_LFN_LEN_PER_ENTRY + 1 ) * MSDOS_NAME_LFN_BYTES_PER_CHAR \
+    * MSDOS_NAME_MAX_UTF8_BYTES_PER_CHAR)
+
 /*
  * External strings. Saves space this way.
  */
@@ -1019,7 +1026,7 @@ msdos_get_utf16_string_from_long_entry (
 {
     ssize_t chars_in_entry;
 
-    if (buf_size >= MSDOS_LFN_LEN_PER_ENTRY * MSDOS_NAME_LFN_BYTES_PER_CHAR) {
+    if (buf_size >= MSDOS_LFN_ENTRY_SIZE) {
         memcpy (&entry_string_buf[0],  &entry[1],  10 );
         memcpy (&entry_string_buf[5],  &entry[14], 12 );
         memcpy (&entry_string_buf[11], &entry[28],  4 );
@@ -1195,7 +1202,7 @@ msdos_compare_entry_against_filename (
 {
   ssize_t      size_remaining = filename_size_remaining;
   int          eno            = 0;
-  uint8_t      entry_normalized[( MSDOS_LFN_LEN_PER_ENTRY + 1 ) * MSDOS_NAME_LFN_BYTES_PER_CHAR * MSDOS_NAME_MAX_UTF8_BYTES_PER_CHAR];
+  uint8_t      entry_normalized[MSDOS_LFN_ENTRY_SIZE_UTF8];
   size_t       bytes_in_entry_normalized = sizeof ( entry_normalized );
 
   eno = (*converter->handler->utf8_normalize_and_fold) (
@@ -1263,7 +1270,7 @@ msdos_find_file_in_directory (
     bool              empty_space_found = false;
     uint32_t          entries_per_block = bts2rd / MSDOS_DIRECTORY_ENTRY_STRUCT_SIZE;
     int               lfn_entry         = 0;
-    uint8_t           entry_utf8_normalized[(MSDOS_LFN_LEN_PER_ENTRY + 1 ) * MSDOS_NAME_LFN_BYTES_PER_CHAR * MSDOS_NAME_MAX_UTF8_BYTES_PER_CHAR/*MSDOS_ENTRY_LFN_UTF8_BYTES*/];
+    uint8_t           entry_utf8_normalized[MSDOS_LFN_ENTRY_SIZE_UTF8];
     size_t            bytes_in_entry;
     bool              filename_matched  = false;
     ssize_t           filename_size_remaining = name_len_for_compare;
@@ -1800,7 +1807,7 @@ msdos_add_file (
             *MSDOS_DIR_LFN_CHECKSUM(entry) = lfn_checksum;
 
             p = entry + 1;
-            n = name_converted + (fat_entries - lfn_entry) * MSDOS_LFN_LEN_PER_ENTRY * MSDOS_NAME_LFN_BYTES_PER_CHAR;
+            n = name_converted + (fat_entries - lfn_entry) * MSDOS_LFN_ENTRY_SIZE;
 
 #if MSDOS_FIND_PRINT
             printf ("MSFS:[11] ");
@@ -1919,8 +1926,8 @@ msdos_find_name_in_fat_file (
                 buffer,
                 buffer_size);
             if (name_len_for_save > 0) {
-                fat_entries = (name_len_for_save -1
-                               + (MSDOS_LFN_LEN_PER_ENTRY * MSDOS_NAME_LFN_BYTES_PER_CHAR)) / (MSDOS_LFN_LEN_PER_ENTRY * MSDOS_NAME_LFN_BYTES_PER_CHAR);
+                fat_entries = (name_len_for_save + MSDOS_LFN_ENTRY_SIZE - 1)
+                    / MSDOS_LFN_ENTRY_SIZE;
                 name_len_for_compare = msdos_filename_utf8_to_long_name_for_compare (
                   converter,
                   name_utf8,
@@ -1984,9 +1991,8 @@ msdos_find_name_in_fat_file (
                   buffer,
                   buffer_size);
               if (name_len_for_save > 0) {
-                fat_entries = (name_len_for_save -1
-                               + (MSDOS_LFN_LEN_PER_ENTRY * MSDOS_NAME_LFN_BYTES_PER_CHAR)) / (MSDOS_LFN_LEN_PER_ENTRY * MSDOS_NAME_LFN_BYTES_PER_CHAR);
-
+                  fat_entries = (name_len_for_save + MSDOS_LFN_ENTRY_SIZE - 1)
+                    / MSDOS_LFN_ENTRY_SIZE;
               }
               else
                   retval = -1;




More information about the vc mailing list