[PATCH 5/9] dosfs: UTF-8 Support: Multibyte conversions

Ralf Kirchner ralf.kirchner at embedded-brains.de
Fri May 31 13:42:21 UTC 2013


Add optional conversion methods for multibyte strings. With these
conversions, which make use of iconv and utf8proc, it becomes possible to
use strings from any language (Czech, Chinese, Arabic, Hebrew, Korean,
...) for file names and directory names.

NOTE: Iconv support must be activated during the build of the tool chain
for these conversion methods (options --enable-newlib-iconv
--enable-newlib-iconv-encodings=[ENCODINGS_YOU_WANT]).  Alternatively
you can provide your own conversion methods.
---
 cpukit/libfs/Makefile.am                 |    1 +
 cpukit/libfs/src/dosfs/dosfs.h           |   17 +-
 cpukit/libfs/src/dosfs/msdos_conv_utf8.c |  303 ++++++++++++++++++++++++++++++
 3 Dateien geändert, 320 Zeilen hinzugefügt(+), 1 Zeile entfernt(-)
 create mode 100644 cpukit/libfs/src/dosfs/msdos_conv_utf8.c

diff --git a/cpukit/libfs/Makefile.am b/cpukit/libfs/Makefile.am
index e06c8bd..58733f7 100644
--- a/cpukit/libfs/Makefile.am
+++ b/cpukit/libfs/Makefile.am
@@ -82,6 +82,7 @@ libdosfs_a_SOURCES += src/dosfs/msdos_create.c src/dosfs/msdos_dir.c \
     src/dosfs/msdos_mknod.c src/dosfs/msdos_node_type.c \
     src/dosfs/msdos_rmnod.c src/dosfs/msdos_statvfs.c \
     src/dosfs/msdos_conv_default.c \
+    src/dosfs/msdos_conv_utf8.c \
     src/dosfs/msdos_conv.c src/dosfs/msdos.h src/dosfs/msdos_format.c \
     src/dosfs/dosfs.h src/dosfs/msdos_rename.c
 endif
diff --git a/cpukit/libfs/src/dosfs/dosfs.h b/cpukit/libfs/src/dosfs/dosfs.h
index c7b27df..b4e66e2 100644
--- a/cpukit/libfs/src/dosfs/dosfs.h
+++ b/cpukit/libfs/src/dosfs/dosfs.h
@@ -206,7 +206,8 @@ typedef struct {
   /**
    * @brief Converter implementation for new filesystem instance.
    *
-   * @see rtems_dosfs_create_default_converter().
+   * @see rtems_dosfs_create_default_converter() and
+   * rtems_dosfs_create_utf8_converter().
    */
   rtems_dosfs_convert_control *converter;
 } rtems_dosfs_mount_options;
@@ -221,6 +222,20 @@ typedef struct {
  */
 rtems_dosfs_convert_control *rtems_dosfs_create_default_converter(void);
 
+/**
+ * @brief Allocates and initializes a UTF-8 converter.
+ *
+ * The converter relies on iconv() descriptors for the conversions between
+ * the codepage and UTF-8 and between UTF-16LE and UTF-8.
+ *
+ * @param[in] codepage The iconv() identification string for the used codepage.
+ *
+ * @retval NULL The converter could not be created, e.g. the memory
+ * allocation failed or iconv_open() could not provide one of the required
+ * conversions.
+ * @retval other Pointer to initialized converter.
+ *
+ * @see rtems_dosfs_mount_options and mount().
+ */
+rtems_dosfs_convert_control *rtems_dosfs_create_utf8_converter(
+  const char *codepage
+);
+
 #define MSDOS_FMT_INFO_LEVEL_NONE   (0)
 #define MSDOS_FMT_INFO_LEVEL_INFO   (1)
 #define MSDOS_FMT_INFO_LEVEL_DETAIL (2)
diff --git a/cpukit/libfs/src/dosfs/msdos_conv_utf8.c b/cpukit/libfs/src/dosfs/msdos_conv_utf8.c
new file mode 100644
index 0000000..601c71e
--- /dev/null
+++ b/cpukit/libfs/src/dosfs/msdos_conv_utf8.c
@@ -0,0 +1,303 @@
+/**
+ * @file
+ *
+ * @ingroup DOSFS
+ *
+ * @brief UTF-8 Converter
+ */
+
+/*
+ * Copyright (c) 2013 embedded brains GmbH.  All rights reserved.
+ *
+ *  embedded brains GmbH
+ *  Dornierstr. 4
+ *  82178 Puchheim
+ *  Germany
+ *  <rtems at embedded-brains.de>
+ *
+ * The license and distribution terms for this file may be
+ * found in the file LICENSE in this distribution or at
+ * http://www.rtems.com/license/LICENSE.
+ */
+
+#include <stddef.h>
+#include <assert.h>
+#include <errno.h>
+#include <iconv.h>
+#include <rtems/dosfs.h>
+#include <utf8proc/utf8proc.h>
+#include "msdos.h"
+
+/*
+ * Marker for an unusable conversion descriptor.  The descriptors obtained
+ * from iconv_open() are compared against this value before they are used or
+ * closed.
+ */
+#define INVALID_ICONV_DESC ( (iconv_t) -1 )
+
+/**
+ * @brief Control block of the UTF-8 converter.
+ *
+ * The generic converter control must stay the first member: the handlers
+ * receive a pointer to it and cast that pointer back to this type.
+ */
+typedef struct {
+  /* Generic converter control handed out to the users of the converter */
+  rtems_dosfs_convert_control super;
+  /* iconv() descriptor for codepage -> UTF-8 */
+  iconv_t desc_codepage_to_utf8;
+  /* iconv() descriptor for UTF-8 -> codepage */
+  iconv_t desc_utf8_to_codepage;
+  /* iconv() descriptor for UTF-16LE -> UTF-8 */
+  iconv_t desc_utf16_to_utf8;
+  /* iconv() descriptor for UTF-8 -> UTF-16LE */
+  iconv_t desc_utf8_to_utf16;
+  /* Work buffer which is published through super.buffer on creation */
+  uint8_t buffer[MSDOS_NAME_MAX_UTF8_LFN_BYTES];
+} msdos_utf8_convert_control;
+
+/**
+ * @brief Converts a buffer with the help of an iconv() descriptor.
+ *
+ * @param[in] desc The conversion descriptor to use.
+ * @param[in] src The data to convert.  It is not modified; the const
+ * qualifier is only removed because iconv() expects a non-const pointer.
+ * @param[in] src_size The size of the source data in bytes.
+ * @param[out] dst The destination buffer.
+ * @param[in, out] dst_size On entry the capacity of the destination buffer
+ * in bytes.  On return the number of bytes stored in the destination buffer,
+ * independent of the returned status.
+ *
+ * @retval 0 The conversion was successful.
+ * @retval EINVAL iconv() succeeded but had to convert at least one character
+ * in a non-reversible way.
+ * @retval ENOMEM iconv() reported an error.
+ */
+static int msdos_utf8_convert_with_iconv(
+  iconv_t     desc,
+  const void *src,
+  size_t      src_size,
+  void       *dst,
+  size_t     *dst_size
+)
+{
+  int     eno = 0;
+  size_t  inbytes_left = src_size;
+  size_t  outbytes_left = *dst_size;
+  char   *inbuf = (void *) (uintptr_t) src;
+  char   *outbuf = dst;
+  size_t  iconv_status;
+
+  iconv_status = iconv(
+    desc,
+    &inbuf,
+    &inbytes_left,
+    &outbuf,
+    &outbytes_left
+  );
+
+  /* Report the number of bytes produced, also in case of a partial result */
+  *dst_size -= outbytes_left;
+
+  /*
+   * iconv() returns an unsigned value and signals an error with
+   * (size_t) -1.  The error must be detected by comparing against exactly
+   * this value: a "< 0" test is never true for a size_t and a "> 0" test
+   * alone would also match the error value.
+   */
+  if ( iconv_status == (size_t) -1 ) {
+    eno = ENOMEM;
+  } else if ( iconv_status > 0 ) {
+    eno = EINVAL;
+  }
+
+  return eno;
+}
+
+/**
+ * @brief Converts a string from the codepage to UTF-8.
+ *
+ * @param[in] super The generic control of a UTF-8 converter.
+ * @param[in] src The string in the codepage encoding.
+ * @param[in] src_size The size of the source string in bytes.
+ * @param[out] dst The destination buffer for the UTF-8 string.
+ * @param[in, out] dst_size Capacity of the destination buffer on entry,
+ * number of bytes stored on return.
+ *
+ * @return The status of msdos_utf8_convert_with_iconv().
+ */
+static int msdos_utf8_codepage_to_utf8(
+  rtems_dosfs_convert_control *super,
+  const char                  *src,
+  size_t                       src_size,
+  uint8_t                     *dst,
+  size_t                      *dst_size
+)
+{
+  const msdos_utf8_convert_control *converter =
+    (const msdos_utf8_convert_control *) super;
+  iconv_t                           desc = converter->desc_codepage_to_utf8;
+
+  return msdos_utf8_convert_with_iconv( desc, src, src_size, dst, dst_size );
+}
+
+/**
+ * @brief Converts a string from UTF-8 to the codepage.
+ *
+ * @param[in] super The generic control of a UTF-8 converter.
+ * @param[in] src The UTF-8 string.
+ * @param[in] src_size The size of the source string in bytes.
+ * @param[out] dst The destination buffer for the codepage string.
+ * @param[in, out] dst_size Capacity of the destination buffer on entry,
+ * number of bytes stored on return.
+ *
+ * @return The status of msdos_utf8_convert_with_iconv().
+ */
+static int msdos_utf8_utf8_to_codepage(
+  rtems_dosfs_convert_control *super,
+  const uint8_t               *src,
+  size_t                       src_size,
+  char                        *dst,
+  size_t                      *dst_size
+)
+{
+  const msdos_utf8_convert_control *converter =
+    (const msdos_utf8_convert_control *) super;
+  iconv_t                           desc = converter->desc_utf8_to_codepage;
+
+  return msdos_utf8_convert_with_iconv( desc, src, src_size, dst, dst_size );
+}
+
+/**
+ * @brief Converts a string from UTF-16 to UTF-8.
+ *
+ * @param[in] super The generic control of a UTF-8 converter.
+ * @param[in] src The UTF-16 string.
+ * @param[in] src_size The size of the source string.  It is handed to
+ * iconv() as the number of input bytes.
+ * @param[out] dst The destination buffer for the UTF-8 string.
+ * @param[in, out] dst_size Capacity of the destination buffer on entry,
+ * number of bytes stored on return.
+ *
+ * @return The status of msdos_utf8_convert_with_iconv().
+ */
+static int msdos_utf8_utf16_to_utf8(
+  rtems_dosfs_convert_control *super,
+  const uint16_t              *src,
+  size_t                       src_size,
+  uint8_t                     *dst,
+  size_t                      *dst_size
+)
+{
+  const msdos_utf8_convert_control *converter =
+    (const msdos_utf8_convert_control *) super;
+  iconv_t                           desc = converter->desc_utf16_to_utf8;
+
+  return msdos_utf8_convert_with_iconv( desc, src, src_size, dst, dst_size );
+}
+
+/**
+ * @brief Converts a string from UTF-8 to UTF-16.
+ *
+ * @param[in] super The generic control of a UTF-8 converter.
+ * @param[in] src The UTF-8 string.
+ * @param[in] src_size The size of the source string in bytes.
+ * @param[out] dst The destination buffer for the UTF-16 string.
+ * @param[in, out] dst_size Capacity of the destination buffer on entry,
+ * amount stored on return.  Both values are handed to and taken from
+ * iconv() as byte counts.
+ *
+ * @return The status of msdos_utf8_convert_with_iconv().
+ */
+static int msdos_utf8_utf8_to_utf16(
+  rtems_dosfs_convert_control *super,
+  const uint8_t               *src,
+  size_t                       src_size,
+  uint16_t                    *dst,
+  size_t                      *dst_size
+)
+{
+  const msdos_utf8_convert_control *converter =
+    (const msdos_utf8_convert_control *) super;
+  iconv_t                           desc = converter->desc_utf8_to_utf16;
+
+  return msdos_utf8_convert_with_iconv( desc, src, src_size, dst, dst_size );
+}
+
+/**
+ * @brief Translates a utf8proc status code into an errno value.
+ *
+ * @param[in] errcode The status code, 0 or one of the UTF8PROC_ERROR_*
+ * values.
+ *
+ * @retval 0 The status code was 0.
+ * @retval ENOMEM The status code was UTF8PROC_ERROR_NOMEM.
+ * @retval EOVERFLOW The status code was UTF8PROC_ERROR_OVERFLOW.
+ * @retval EINVAL The status code was UTF8PROC_ERROR_INVALIDUTF8,
+ * UTF8PROC_ERROR_NOTASSIGNED or UTF8PROC_ERROR_INVALIDOPTS.
+ * @retval ENOENT Any other status code.
+ */
+static int msdos_utf8proc_errmsg_to_errno( ssize_t errcode )
+{
+  switch ( errcode ) {
+    case 0:
+      return 0;
+    case UTF8PROC_ERROR_NOMEM:
+      return ENOMEM;
+    case UTF8PROC_ERROR_OVERFLOW:
+      return EOVERFLOW;
+    case UTF8PROC_ERROR_INVALIDUTF8:
+    case UTF8PROC_ERROR_NOTASSIGNED:
+    case UTF8PROC_ERROR_INVALIDOPTS:
+      return EINVAL;
+    default:
+      return ENOENT;
+  }
+}
+
+/**
+ * @brief Decomposes and case folds a UTF-8 string.
+ *
+ * The destination buffer serves two purposes: it is first used as an array
+ * of 32-bit code points for utf8proc_decompose() and then re-encoded in
+ * place to UTF-8 by utf8proc_reencode().
+ *
+ * NOTE(review): the destination buffer is accessed as an int32_t array, so
+ * it is presumably expected to be suitably aligned -- confirm against the
+ * callers.
+ *
+ * @param[in] super The converter control (unused).
+ * @param[in] src The UTF-8 string to normalize.
+ * @param[in] src_size The size of the source string in bytes.
+ * @param[out] dst The destination buffer.
+ * @param[in, out] dst_size On entry the capacity of the destination buffer
+ * in bytes.  On return the size of the normalized string in bytes, unless
+ * utf8proc reported an error, in which case it is left unchanged.
+ *
+ * @retval 0 Successful operation.
+ * @retval ENOMEM The destination buffer is too small.  If it has room for
+ * at least one code point, then a truncated result is provided.
+ * @retval other A utf8proc error translated by
+ * msdos_utf8proc_errmsg_to_errno().
+ */
+static int msdos_utf8_normalize_and_fold(
+  rtems_dosfs_convert_control *super,
+  const uint8_t *src,
+  const size_t   src_size,
+  uint8_t       *dst,
+  size_t        *dst_size
+)
+{
+  int      eno              = 0;
+  int32_t *unicode_buf      = (int32_t *) dst;
+  ssize_t  unicode_buf_size = *dst_size / sizeof( *unicode_buf );
+  ssize_t  unicodes_to_reencode;
+  ssize_t  result;
+
+  (void) super;
+
+  /*
+   * Without room for at least one code point there is nothing that can be
+   * stored.  Bail out early, otherwise the truncation path below would ask
+   * utf8proc_reencode() to process a negative number of code points on a
+   * buffer that may have no space at all.
+   */
+  if ( unicode_buf_size < 1 ) {
+    *dst_size = 0;
+    return ENOMEM;
+  }
+
+  result = utf8proc_decompose(
+    src,
+    (ssize_t) src_size,
+    unicode_buf,
+    unicode_buf_size,
+    UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_CASEFOLD
+  );
+
+  if ( result >= 0 ) {
+    if ( result < unicode_buf_size ) {
+      unicodes_to_reencode = result;
+    } else {
+      /*
+       * The buffer is too small for the complete result.  Re-encode what
+       * fits and keep one element in reserve for utf8proc_reencode().
+       */
+      unicodes_to_reencode = unicode_buf_size - 1;
+      eno = ENOMEM;
+    }
+
+    result = utf8proc_reencode(
+      unicode_buf,
+      unicodes_to_reencode,
+      UTF8PROC_STABLE | UTF8PROC_DECOMPOSE
+    );
+
+    if ( result >= 0 ) {
+      *dst_size = result;
+    } else {
+      eno = msdos_utf8proc_errmsg_to_errno( result );
+    }
+  } else {
+    eno = msdos_utf8proc_errmsg_to_errno( result );
+  }
+
+  return eno;
+}
+
+/**
+ * @brief Destroys a UTF-8 converter.
+ *
+ * Closes every conversion descriptor which is not INVALID_ICONV_DESC and
+ * frees the converter control afterwards.
+ *
+ * @param[in] super The generic control of the UTF-8 converter to destroy.
+ */
+static void msdos_utf8_destroy(
+  rtems_dosfs_convert_control *super
+)
+{
+  msdos_utf8_convert_control *converter =
+    (msdos_utf8_convert_control *) super;
+  iconv_t                     descs[] = {
+    converter->desc_utf16_to_utf8,
+    converter->desc_codepage_to_utf8,
+    converter->desc_utf8_to_codepage,
+    converter->desc_utf8_to_utf16
+  };
+  size_t                      i;
+
+  for ( i = 0; i < sizeof( descs ) / sizeof( descs[ 0 ] ); ++i ) {
+    if ( descs[ i ] != INVALID_ICONV_DESC ) {
+      int rv = iconv_close( descs[ i ] );
+
+      assert( rv == 0 );
+      (void) rv;
+    }
+  }
+
+  free( converter );
+}
+
+/* Handler table which connects the generic converter interface with the
+ * UTF-8 converter functions of this file. */
+static const rtems_dosfs_convert_handler msdos_utf8_convert_handler = {
+  .codepage_to_utf8        = msdos_utf8_codepage_to_utf8,
+  .utf8_to_codepage        = msdos_utf8_utf8_to_codepage,
+  .utf16_to_utf8           = msdos_utf8_utf16_to_utf8,
+  .utf8_to_utf16           = msdos_utf8_utf8_to_utf16,
+  .utf8_normalize_and_fold = msdos_utf8_normalize_and_fold,
+  .destroy                 = msdos_utf8_destroy
+};
+
+/*
+ * Allocates a UTF-8 converter and opens the iconv() descriptors for the
+ * conversions codepage <-> UTF-8 and UTF-16LE <-> UTF-8.
+ *
+ * Returns the generic control of the new converter, or NULL if the codepage
+ * is NULL, the memory allocation failed or one of the descriptors could not
+ * be opened.  In the failure case everything acquired so far is released
+ * again.
+ */
+rtems_dosfs_convert_control *rtems_dosfs_create_utf8_converter(
+  const char *codepage
+)
+{
+  rtems_dosfs_convert_control *super = NULL;
+  msdos_utf8_convert_control  *self;
+
+  /* Do not hand a NULL identification string to iconv_open() */
+  if ( codepage == NULL ) {
+    return NULL;
+  }
+
+  self = malloc( sizeof( *self ) );
+
+  if ( self != NULL ) {
+    /*
+     * Open all descriptors before checking them, so that the destroy
+     * handler never sees an uninitialized descriptor.
+     */
+    self->desc_codepage_to_utf8 = iconv_open( "UTF-8", codepage );
+    self->desc_utf8_to_codepage = iconv_open( codepage, "UTF-8" );
+    self->desc_utf16_to_utf8    = iconv_open( "UTF-8", "UTF-16LE" );
+    self->desc_utf8_to_utf16    = iconv_open( "UTF-16LE", "UTF-8" );
+
+    if (
+      self->desc_utf16_to_utf8 != INVALID_ICONV_DESC
+        && self->desc_utf8_to_codepage != INVALID_ICONV_DESC
+        && self->desc_codepage_to_utf8 != INVALID_ICONV_DESC
+        && self->desc_utf8_to_utf16 != INVALID_ICONV_DESC
+    ) {
+      super = &self->super;
+
+      super->handler = &msdos_utf8_convert_handler;
+      super->buffer.data = &self->buffer;
+      super->buffer.size = sizeof( self->buffer );
+    } else {
+      /* Closes the descriptors which were opened and frees the control */
+      msdos_utf8_destroy( &self->super );
+    }
+  }
+
+  /*
+   * Return an explicit NULL in the failure case instead of computing a
+   * member address through a NULL pointer.
+   */
+  return super;
+}
-- 
1.7.10.4




More information about the devel mailing list