% \iffalse meta-comment
%
% Copyright (C) 2011-2018 by Enrico Gregorio
% <Enrico dot Gregorio at univr dot it>
% -------------------------------------------------------
%
% This work may be distributed and/or modified under the
% conditions of the LaTeX Project Public License, either
% version 1.3c of this license or (at your option) any
% later version. The latest version of this license is in
%   http://www.latex-project.org/lppl.txt
% and version 1.3 or later is part of all distributions
% of LaTeX version 2005/12/01 or later.
%
% This work has the LPPL maintenance status `maintained'.
%
% The Current Maintainer of this work is Enrico Gregorio.
%
% This work consists of the files
%   newunicodechar.dtx
%   newunicodechar.ins
% and the derived file newunicodechar.sty.
%
% \fi
%
% \iffalse
%<*driver>
\ProvidesFile{newunicodechar.dtx}
%</driver>
%<package>\NeedsTeXFormat{LaTeX2e}[2018/04/01]
%<package>\ProvidesPackage{newunicodechar}
%<*package>
    [2018/04/08 v1.2 Defining Unicode characters]
%</package>
%
%<*driver>
% Documentation driver: typesets this very file with the ltxdoc class,
% then prints the change history and the index.
\documentclass{ltxdoc}
\usepackage[T1]{fontenc}
\usepackage{metalogo,booktabs,lmodern,textcomp,wasysym}
% Cross references, an index keyed by code line and the change log
% are all switched on.
\EnableCrossrefs
\CodelineIndex
\RecordChanges
\begin{document}
  \DocInput{newunicodechar.dtx}
  \PrintChanges
  \PrintIndex
\end{document}
%</driver>
% \fi
%
% \CheckSum{201}
%
% \CharacterTable
%  {Upper-case    \A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z
%   Lower-case    \a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z
%   Digits        \0\1\2\3\4\5\6\7\8\9
%   Exclamation   \!     Double quote  \"     Hash (number) \#
%   Dollar        \$     Percent       \%     Ampersand     \&
%   Acute accent  \'     Left paren    \(     Right paren   \)
%   Asterisk      \*     Plus          \+     Comma         \,
%   Minus         \-     Point         \.     Solidus       \/
%   Colon         \:     Semicolon     \;     Less than     \<
%   Equals        \=     Greater than  \>     Question mark \?
%   Commercial at \@     Left bracket  \[     Backslash     \\
%   Right bracket \]     Circumflex    \^     Underscore    \_
%   Grave accent  \`     Left brace    \{     Vertical bar  \|
%   Right brace   \}     Tilde         \~}
%
% \changes{v1.2}{2018/04/08}{Now utf8 is default}
% \changes{v1.1}{2012/11/12}{Added support for \mbox{\textsf{inputenx}}}
% \changes{v1.0}{2011/02/15}{Initial version}
%
% \GetFileInfo{newunicodechar.dtx}
%
% \DoNotIndex{\newcommand,\newenvironment,\!,\@empty,\@gobble,\@gobbletwo}
% \DoNotIndex{\@ifpackageloaded,\@ifpackagewith,\@ifundefined,\@namedef}
% \DoNotIndex{\@nil,\@onlypreamble,\@tempa,\@tempb,\@tempswafalse,\def}
% \DoNotIndex{\@tempswatrue,\^,\-,\active,\begingroup,\catcode,\@car,\@cdr}
% \DoNotIndex{\edef,\else,\endgroup,\endinput,\expandafter,\fi,\if}
% \DoNotIndex{\if@tempswa,\ifcase,\ifnum,\ifx,\lccode,\let,\lowercase}
% \DoNotIndex{\MessageBreak,\next,\number,\numexpr,\or,\PackageError}
% \DoNotIndex{\PackageWarning,\PackageWarningNoLine,\strip@prefix,\@@end}
% \DoNotIndex{\relax,\space,\string,\DeclareOption,\ProcessOptions}
% \DoNotIndex{\meaning,\ifdefined,\csname,\chardef,\endcsname,\protect}
%
% \title{The \textsf{newunicodechar} package\thanks{This document
%   corresponds to \textsf{newunicodechar}~\fileversion, dated \filedate.}}
% \author{Enrico Gregorio \\ \texttt{Enrico dot Gregorio at univr dot it}}
%
% \maketitle
%
% \section{Introduction}
%
% When using Unicode input with \LaTeX{} it's not so uncommon to get an
% incomprehensible error message such as
%\begin{verbatim}
%Unicode char \u8:xxx not set up for use with LaTeX
%\end{verbatim}
% where |xxx| may be the actual character we input or a combination of
% strange characters. This happens because the \texttt{utf8} option
% given to \textsf{inputenc} or \textsf{inputenx} defines the \LaTeX{}
% meaning of many Unicode characters, but, of course, not all of them.
%
% For example, one might want to write some Latin words with prosodic
% marks, i.e., the diacritics that tell whether a vowel is long or
% short, in order to distinguish between `p\u{o}p\u{u}lus' (people)
% and `p\=op\u{u}lus' (poplar), but using the actual Unicode
% characters that make the \LaTeX{} document easier to read; look at
% the following table, where on the left the input is via the \LaTeX{}
% Internal Character Representation (LICR) and on the right it's via
% Unicode characters,
% \begin{center}
% \begin{tabular}{cc}
% \toprule
% LICR & Unicode \\
% \midrule
% \verb|p\u{o}p\u{u}lus| & \texttt{p\u{o}p\u{u}lus}\\
% \verb|p\={o}p\u{u}lus| & \texttt{p\={o}p\u{u}lus}\\
% \bottomrule
% \end{tabular}
% \end{center}
% and judge by yourselves which one is better. Unfortunately, the
% \texttt{utf8} option to \textsf{inputenc} doesn't define a meaning
% for \texttt{\u{o}}, \texttt{\={o}}, and \texttt{\u{u}}. As a matter
% of fact, only \texttt{\u{a}} and \texttt{\u{A}} are defined, as they
% are used in the Romanian language.
%
% One might resort to |\DeclareUnicodeCharacter| in the document's
% preamble, but this requires looking up the (long) list of Unicode
% characters and jotting down the relevant numbers. For example,
% \texttt{\u{o}} is \texttt{U+014F}, so the declaration
%\begin{verbatim}
%\DeclareUnicodeCharacter{014F}{\u{o}}
%\end{verbatim}
% would do for \texttt{\u{o}}.
%
% The present package introduces a simpler interface that frees the
% user from the burden of looking up the tables: all that's needed is
% \begin{flushleft}
% |\newunicodechar{|\texttt{\u{o}}|}{\u{o}}|
% \end{flushleft}
% You are not restricted to definitions like this: for example,
% \eighthnote{}~is Unicode \texttt{U+266A}, but you are not required
% to know it: if your editor can insert the character~\eighthnote, you
% may define its meaning by loading a package that provides it and say
% \begin{flushleft}
% |\usepackage{wasysym}|\\
% |\newunicodechar{|\eighthnote|}{\eighthnote}|
% \end{flushleft}
% Important note: the package will not work if the \textsf{inputenc} package
% is called with an option different from \texttt{utf8}. As of April 2018,
% this encoding is assumed by default in 8~bit \TeX{} engines when running
% \LaTeX. However one can still load \textsf{inputenc} or \textsf{inputenx}
% with the \texttt{utf8} option (not \texttt{utf8x}).
%
% A similar problem may arise even with \XeLaTeX{}. A frequently asked
% question on mailing lists or discussion groups is how to print some
% particular character in a different font than the main one of the
% document, say, for example, the Euro sign which, in some fonts, is
% horrible. The usual answer is to write something like
% \begin{flushleft}
% |\newfontfamily{\eurofont}{|\meta{some font}|}|\\
% |\catcode`|\texttt{\texteuro}|=\active|\\
% |\protected\def |\texttt{\texteuro}|{{\eurofont\char`\|^^A
% \texttt{\texteuro}|}}|
% \end{flushleft}
% which, for the average user, is somewhat scary. With
% \textsf{newunicodechar} this may be simplified into
% \begin{flushleft}
% |\newfontfamily{\eurofont}{|\meta{some font}|}|\\
% |\newunicodechar{|\texttt{\texteuro}|}{{\eurofont\texteuro}}|
% \end{flushleft}
%
% \section{Usage}
%
% The package requires the use of a Unicode engine, i.e., \XeLaTeX{}
% or \LuaLaTeX{}, or, with (pdf)\LaTeX{}, the \textsf{inputenc} or
% \textsf{inputenx} package along with the \texttt{utf8} option. It
% won't work with the \texttt{utf8x} option that employs a completely
% different mechanism for parsing Unicode characters in
% (pdf)\LaTeX{}. It should be said that \texttt{utf8x} defines many
% more characters than \texttt{utf8}, so that the present package
% wouldn't be needed.
%
% Of course the \LaTeX{} document must be written using a Unicode
% savvy editor.
%
% \bigskip
%
% The package has only one option, \texttt{verbose}^^A
% \marginpar{\raggedleft\texttt{verbose}}^^A
% \index{verbose=\texttt{verbose} option|usage}, which is off by
% default. If the package is called by
% |\usepackage[verbose]{newunicodechar}|, then the informative message
% on the log file will show the old definition along with the warning
% about the redefinition. Unfortunately this definition usually has a
% rather cryptic format; for example, redefining \texttt{\u{a}} would
% print
%\begin{verbatim}
%Redefining Unicode character; it meant
%***  \IeC {\u a}  ***
%before your redefinition on input line 22.
%\end{verbatim}
% Call the package with this option if you are worried about what you
% are redefining, but the meaning of the Unicode character should
% correspond easily to just one LICR entry. This option does nothing
% when a Unicode engine is used, because no character is active
% initially (except for~|~|), so there should be no risk of redefining
% anything.
%
% \DescribeMacro{\newunicodechar}
% The package provides only one command, |\newunicodechar|, which must
% be called with two arguments:
% \begin{flushleft}
% |\newunicodechar{|\meta{char}|}{|\meta{code}|}|
% \end{flushleft}
% where \meta{char} is the Unicode character to which we need to give
% a meaning and \meta{code} is that meaning, that is the \LaTeX{} code
% that will be substituted to the character. Here is what's needed for
% the prosodic marks in Latin:
% \begin{flushleft}
% |%\newunicodechar{|\texttt{\u{A}}|}{\u{A}}|
% |\newunicodechar{|\texttt{\u{a}}|}{\u{a}}|\\
% |\newunicodechar{|\texttt{\u{E}}|}{\u{E}}|
% |\newunicodechar{|\texttt{\u{e}}|}{\u{e}}|\\
% |\newunicodechar{|\texttt{\u{I}}|}{\u{I}}|
% |\newunicodechar{|\texttt{\u{\i}}|}{\u{\i}}|\\
% |\newunicodechar{|\texttt{\u{O}}|}{\u{O}}|
% |\newunicodechar{|\texttt{\u{o}}|}{\u{o}}|\\
% |\newunicodechar{|\texttt{\u{U}}|}{\u{U}}|
% |\newunicodechar{|\texttt{\u{u}}|}{\u{u}}|\\
% |\newunicodechar{|\texttt{\={A}}|}{\={A}}|
% |\newunicodechar{|\texttt{\={a}}|}{\={a}}|\\
% |\newunicodechar{|\texttt{\={E}}|}{\={E}}|
% |\newunicodechar{|\texttt{\={e}}|}{\={e}}|\\
% |\newunicodechar{|\texttt{\={I}}|}{\={I}}|
% |\newunicodechar{|\texttt{\={\i}}|}{\={\i}}|\\
% |\newunicodechar{|\texttt{\={O}}|}{\={O}}|
% |\newunicodechar{|\texttt{\={o}}|}{\={o}}|\\
% |\newunicodechar{|\texttt{\={U}}|}{\={U}}|
% |\newunicodechar{|\texttt{\={u}}|}{\={u}}|
% \end{flushleft}
% The first line is commented out, because those characters are
% already defined. It doesn't hurt to give a definition again:
% \LaTeX{} will just warn about it.
%
% Caution: when used with \XeLaTeX{} or \LuaLaTeX{}, there will be no
% warning, but the standard setup doesn't define any active character.
%
% The first argument \emph{must} consist of a single Unicode
% character, but the package checks for it and raises an error
% otherwise; it will raise an error also if the first argument
% consists of a plain ASCII character: defining them is not allowed in
% (pdf)\LaTeX{} and would break almost everything in \XeLaTeX{} or
% \LuaLaTeX{}.
%
% \section{An easier way?}
%
% One could dispense with this package, since the same effect may be
% obtained in the way we have already seen in \XeLaTeX{} or \LuaLaTeX{};
% with (pdf)\LaTeX{}, calling |\newunicodechar{|\texttt{\={u}}|}{\={u}}|
% is equivalent to saying
% \begin{flushleft}
% |\makeatletter|\\
% |\@namedef{u8:\detokenize{|\texttt{\={u}}|}}{\={u}}|\\
% |\makeatother|
% \end{flushleft}
%
% \StopEventually{}
%
% \section{Implementation}
%
% The usual presentation, that we repeat here for completeness.
%\begin{verbatim}
%\ProvidesFile{newunicodechar.dtx}
%\NeedsTeXFormat{LaTeX2e}[2018/04/01]
%\ProvidesPackage{newunicodechar}
%\end{verbatim}
% The date for the format has been chosen to match the \LaTeX{}
% release in which \texttt{utf8} became the default input encoding
% for eight bit engines.
%
% Now the real macros. First of all we check that the typesetting
% engine is sufficiently recent to include $\varepsilon$-\TeX{}
% extensions.
%    \begin{macrocode}
%% Guard against engines without the e-TeX extensions: the test looks
%% for eTeXversion and, when it is missing, raises an error and then
%% ends the run.  The code below uses detokenize, numexpr, ifdefined
%% and protected, which are e-TeX primitives.
\@ifundefined{eTeXversion}
  {\PackageError{newunicodechar}{LaTeX engine too old, aborting}
   {Please upgrade your TeX system}\@@end}{}
%    \end{macrocode}
%
% \subsection{Options}
% There's only one option.
%    \begin{macrocode}
%% The verbose option only creates the marker nuc@verbose; the code
%% that follows merely tests whether this marker is defined.
\DeclareOption{verbose}{\let\nuc@verbose=T}
\ProcessOptions\relax
%    \end{macrocode}
% \subsection{Error messages}
%    \begin{macrocode}
%% Error issued when the character to be defined is plain ASCII.
\def\nuc@onebyteerr{\PackageError{newunicodechar}
  {ASCII character requested}
  {Only characters above U+007F may be defined; you asked
   for\MessageBreak a plain ASCII character and your definition
   has been ignored.}}
%% Error issued when the first argument of newunicodechar is empty.
\def\nuc@emptyargerr{\PackageError{newunicodechar}
  {Empty argument}
  {You shouldn't write \protect\newunicodechar{}{...}}}
%% Error issued when the first argument is too long or is not a valid
%% sequence of bytes.
\def\nuc@invalidargerr{\PackageError{newunicodechar}
  {Invalid argument}
  {The first argument to \protect\newunicodechar\space is
   either\MessageBreak too long or an invalid sequence of bytes}}
%    \end{macrocode}
%
% \subsection{Unicode engines}
% In case we are running a Unicode engine (\texttt{xelatex} or
% \texttt{lualatex}), the definition of the main macro is easier: we
% check whether the character is above $127$ and, in this case, we
% make it active expanding to the second argument. This definition of
% the main macro will be seen only if the engine has the |^^^^|
% convention for inputting characters in hexadecimal format, so that
% |^^^^0021| is one token and |\@gobble| will eat it up, making
% |\next| equal to |\@empty|; with an eight bit engine (\texttt{latex}
% or \texttt{pdflatex}), |\@gobble| will swallow only |^^^|.
%
% In this case we define the main macro all in one swoop; first we
% check that the first argument is nonempty, then we act only if it
% consists of only one (character) token. Then if the charcode of this
% token is not greater than $127$ we emit an error message; otherwise we
% activate the character and define its (protected) expansion to be
% the second argument. The last action, in this case, is to allow
% |\newunicodechar| only in the preamble. Then we do |\endinput|.  If
% the engine is not Unicode savvy, everything up to the closing |\fi|
% is swallowed up.
%
% \begin{macro}{\newunicodechar}
% Here is the code for defining the Unicode engine version of the main
% macro. Notice that we try being on the safe side by not assuming any
% particular category code for~|~|, because we restore it after giving
% the definition where we need it to be active.
%    \begin{macrocode}
%% Engine test.  Inside a group the caret is made a superscript
%% character while character 30 and the exclamation mark get category
%% code 12; then gobble is expanded in front of the four-caret notation
%% for character hex 0021.  When the engine reads that notation as one
%% single token nothing is left over and next turns out to be empty.
%% The comparison is expanded before the group is closed, that is while
%% the local definition of next still exists.
\begingroup
\catcode`\^=7 \catcode30=12 \catcode`\!=12 % for safety
\edef\next{\@gobble^^^^0021}
\expandafter\endgroup
\ifx\next\@empty % Start of code for Unicode engines
%% Save the current category code of the tilde and make the tilde
%% active for the time the main macro is being defined.
\chardef\nuc@atcode=\catcode`\~
\catcode`\~=\active
%% Main macro, Unicode engine version.
%%   #1  the character to be defined
%%   #2  the code the character has to stand for
\def\newunicodechar#1#2{%
  % An empty first argument is an error.
  \if\relax\detokenize{#1}\relax
    \nuc@emptyargerr
  \else
    % Go on only if nothing follows the first token of the argument.
    \if\relax\detokenize\expandafter{\@cdr#1\@nil}\relax
      % Only characters whose code is above hex 7F may be defined.
      \ifnum`#1>\string"7F
        \catcode`#1=\active
        % Lowercase trick: the active tilde is turned into the active
        % form of the requested character, which then receives a
        % protected definition expanding to the second argument.
        \begingroup\lccode`\~=`#1
        \lowercase{\endgroup\protected\def~}{#2}%
      \else
        \nuc@onebyteerr
      \fi
    \else
      \nuc@invalidargerr
    \fi
  \fi}
%% Restore the tilde, restrict the macro to the preamble and stop
%% reading the file.  The expandafter makes TeX process the closing fi
%% of the next line before endinput takes effect, so the conditional
%% is properly closed.
\catcode`\~=\nuc@atcode
\@onlypreamble\newunicodechar
\expandafter\endinput
\fi % End of code for Unicode engines
%    \end{macrocode}
% \end{macro}
%
% \subsection{Eight bit engines}
% From now on we can assume an eight bit engine is used; we check that
% the current input encoding, as recorded in |\inputencodingname|, is
% \texttt{utf8}, otherwise we define |\newunicodechar| to swallow its
% arguments, after a warning, and stop reading the package. Both
% strings are detokenized before being compared.
% \changes{v1.2}{2018/04/08}{Now utf8 is default}
% \changes{v1.1}{2012/11/12}{Added support for \mbox{\textsf{inputenx}}}
%    \begin{macrocode}
%% Fallback used when the input encoding is not utf8: a warning is
%% issued, newunicodechar becomes a macro that throws away its two
%% arguments and the rest of this file is not read.
\def\nuc@stop{\PackageWarningNoLine{newunicodechar}
  {This package only works if the document\MessageBreak
   encoding is `utf8'}%
  \let\newunicodechar\@gobbletwo\endinput}
%% Compare the detokenized name of the current input encoding with the
%% detokenized string utf8.
\edef\@tempa{\detokenize{utf8}}
\edef\@tempb{\detokenize\expandafter{\inputencodingname}}
%% The call and the closing fi must share one line: after endinput TeX
%% still finishes the current line but reads no further one, so a fi
%% placed on a later line would never be seen and the conditional
%% would be left open until the end of the job.
\ifx\@tempb\@tempa\else\nuc@stop\fi
%    \end{macrocode}
% \begin{macro}{\newunicodechar}
% The main macro. We set the temporary switch to false and put in
% |\nuc@tempa| the first argument, but detokenized, since it consists of
% active characters. We check that it's not empty and access its
% first token, which we put into |\@tempb|. Then we call |\nuc@check|
% and if it sets the temporary switch to true, we execute the
% definition, since the first argument is a valid UTF-8 character, but
% we issue a warning if it already has a meaning. All we need to do is
% to define the macro |\csname u8:\nuc@tempa\endcsname|, because that's
% how \textsf{inputenc} acts. The part between |\ifdefined| and the
% corresponding |\fi| will be executed only with the \texttt{verbose}
% option.
%    \begin{macrocode}
%% Main macro, eight bit engine version.
%%   #1  the character to be defined
%%   #2  the code the character has to stand for
\def\newunicodechar#1#2{%
  % The switch is turned on again only by a successful check.
  \@tempswafalse
  % Work on the detokenized form of the first argument.
  \edef\nuc@tempa{\detokenize{#1}}%
  \if\relax\nuc@tempa\relax
    \nuc@emptyargerr
  \else
    % Isolate the first byte, then validate the whole argument.
    \edef\@tempb{\expandafter\@car\nuc@tempa\@nil}%
    \nuc@check
    \if@tempswa
      % Warn if a definition is already there; the old meaning is part
      % of the message only when the verbose marker is defined.
      \@ifundefined{u8:\nuc@tempa}{}
        {\PackageWarning{newunicodechar}
          {Redefining Unicode character\ifdefined\nuc@verbose;
           it meant\MessageBreak
           ***\space\space\nuc@meaning\space\space***\MessageBreak
           before your redefinition\fi}}%
      % Define the macro whose name is u8: followed by the bytes of
      % the character.
      \@namedef{u8:\nuc@tempa}{#2}%
    \fi
  \fi
}
%    \end{macrocode}
% \end{macro}
% The first helper macro computes the number of bytes in the first
% argument, though it appears to the user as a single character. The
% second is used for the \texttt{verbose} option.
%    \begin{macrocode}
%% Recursive counter meant to be used inside a numexpr: it leaves one
%% +1 for every token found before the @nil delimiter and a final
%% relax when the delimiter is reached.
\def\nuc@getlength#1{%
  \ifx#1\@nil
    \expandafter\relax
  \else
    +1\expandafter\nuc@getlength
  \fi}
%% Defined only with the verbose option: the meaning of the macro named
%% u8: plus the current bytes, with the prefix removed by strip@prefix.
\ifdefined\nuc@verbose
  \def\nuc@meaning{\expandafter\expandafter\expandafter
    \strip@prefix\expandafter\meaning\csname u8:\nuc@tempa\endcsname}
\fi
%    \end{macrocode}
% We select the action based on the length of the first argument; in
% case only one byte appears, the user is trying to define an ASCII
% character, which is not allowed; if the input is two, three, or four
% bytes long, we check whether the first byte has the correct form:
% its binary form must be, respectively, \texttt{110xxxxx},
% \texttt{1110xxxx}, or \texttt{11110xxx}, so not less than $192$,
% $224$, or $240$. The macro |\nuc@ch@ck| does precisely this, issuing
% an error message if the argument is invalid or else setting the
% temporary switch to true.
%    \begin{macrocode}
%% Choose the action according to the number of bytes stored in
%% nuc@tempa, as counted by nuc@getlength.
\def\nuc@check{%
  \ifcase\numexpr0\expandafter\nuc@getlength\nuc@tempa\@nil
    \or % end of case 0 (no byte): nothing is done
    \nuc@onebyteerr\or % end of case 1 (one byte): ASCII, rejected
    \nuc@ch@ck{192}\or % end of case 2 (two bytes): lead byte from 192
    \nuc@ch@ck{224}\or % end of case 3 (three bytes): lead byte from 224
    \nuc@ch@ck{240}\else % end of case 4 (four bytes): lead byte from 240
    \nuc@invalidargerr
  \fi}
%% #1 is the smallest value accepted for the first byte, which is kept
%% in @tempb: a smaller byte raises the invalid argument error, any
%% other one sets the temporary switch to true.
\def\nuc@ch@ck#1{%
  \expandafter\ifnum\expandafter`\@tempb<#1\relax
    \nuc@invalidargerr
  \else
    \@tempswatrue
  \fi
}
%    \end{macrocode}
% Finally we disallow |\newunicodechar| outside the preamble.
%    \begin{macrocode}
%% Register newunicodechar as a command for the preamble only.
\@onlypreamble\newunicodechar
%    \end{macrocode}
% \Finale
\endinput