/*  $Id$

    Part of SWI-Prolog

    Author:        Jan Wielemaker
    E-mail:        wielemak@science.uva.nl
    WWW:           http://www.swi-prolog.org
    Copyright (C): 2006, University of Amsterdam

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License
    as published by the Free Software Foundation; either version 2
    of the License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

    As a special exception, if you link this library with other files,
    compiled with a Free Software compiler, to produce an executable, this
    library does not by itself cause the resulting executable to be covered
    by the GNU General Public License. This exception does not however
    invalidate any other reasons why the executable file might be covered by
    the GNU General Public License.
*/

:- module(prolog_syntax_map,
	  [ main/0,
	    write_syntax_map/2		% +File, +Options
	  ]).
:- use_module(library(debug), [assertion/1]).
:- use_module(library(lists), [member/2]).
:- use_module(library(option), [option/3]).
:- use_module(library('unicode/unicode_data'), [unicode_property/2]).
:- use_module(derived_core_properties, [unicode_derived_core_property/2]).

/** <module> Generate Prolog Unicode map

Create a C structure and  access   functions  for  classification of the
characters we need  for  realising  the   Prolog  syntax.  We  keep  the
definition of the first 128  ASCII   characters.  Characters  above that
needs to be classified as

	* id_start (csymf)
	May start an identifier.

	* id_continue (csym)
	May be used anywhere in identifier

	* uppercase
	We need this to be able to distinquish variables from non-variables.

	* Separators
	We need this for classifying blank space

	* Symbols
	Characters that glue together to form symbols.  These extend the
	default Prolog symbol set: #$&*+-./:<=>?@\^`~

	* lowercase
	<not needed by Prolog>
*/

:- multifile
	user:file_search_path/2.

user:file_search_path(unicode, '.').


main :-
	write_syntax_map('../pl-umap.c', []).

last_unicode_page(LastPage) :-
	LastPage is (0x10ffff + 1) // 0x100.

		 /*******************************
		 *	     C TABLES		*
		 *******************************/

%%	write_syntax_map(+File, +Options)
%
%	Options supported are:
%
%		# first_codepage [0]
%		Code page to start
%
%		# last_codepage [last_unicode_page/1]
%		Code page to end.

write_syntax_map(File, Options) :-
	open(File, write, Out),
	call_cleanup(write_sort_map(Out, Options),
		     close(Out)).

write_sort_map(Out, Options) :-
	gen_tables(Tables, Options),
	write_header(Out, Options),
	forall((member(table(CP, Map), Tables),
		is_list(Map)),
	       write_codepage(Out, CP, Map)),
	write_map(Out, Tables, Options),
	write_footer(Out, Options).

write_codepage(Out, CP, Map) :-
	assertion(length(Map, 256)),
	cp_name(CP, CPN),
	format(Out, 'static const char ~w[256] =~n', [CPN]),
	format(Out, '{ ', []),
	map_entries(Map, 0, Out),
	format(Out, '~N};~n~n', []).

cp_name(CP, CPN) :-
	format(atom(CPN), 'ucp0x~|~`0t~16r~2+', [CP]).

map_entries([], _, _).
map_entries([H|T], I, Out) :-
	(   I == 0
	->  true
	;   0 =:= I mod 8
	->  format(Out, ',~n  ', [])
	;   format(Out, ', ', [])
	),
	format(Out, '~|0x~`0t~16r~2+', [H]),
	I2 is I + 1,
	map_entries(T, I2, Out).

write_map(Out, Tables, Options) :-
	last_unicode_page(DefLast),
	option(last_codepage(Last), Options, DefLast),
	format(Out,
	       'static const char* const uflags_map[UNICODE_MAP_SIZE] =~n',
	       []),
	format(Out, '{ ', []),
	map_tables(0, Last, Tables, Out),
	format(Out, '~N};~n~n', []).

map_tables(CP, Last, _, _) :-
	CP > Last, !.
map_tables(CP, Last, Tables, Out) :-
	(   CP == 0
	->  true
	;   0 =:= CP mod 8
	->  format(Out, ',~n  ', [])
	;   format(Out, ', ', [])
	),
	memberchk(table(CP, Map), Tables),
	(   is_list(Map)
	->  cp_name(CP, CPN),
	    format(Out, '~w', [CPN])
	;   format(Out, '~|~tF(0x~16r)~7+', [Map])
	),
	CP2 is CP + 1,
	map_tables(CP2, Last, Tables, Out).


write_header(Out, Options) :-
	last_unicode_page(DefLast),
	option(last_codepage(Last), Options, DefLast),
	Size is Last+1,
	format(Out, '/*  Generated file.  Do not edit!\n    \c
		         Generated by Unicode/prolog_syntax_map.pl\n\c
		     */~n~n', []),
	format(Out, '#define UNICODE_MAP_SIZE ~d~n', [Size]),
	format(Out, '#define F(c) (const char*)(c)~n~n', [Size]),
	(   flag_name(Name, Hex),
	    upcase_atom(Name, Up),
	    format(Out, '#define U_~w~t0x~16r~32|~n', [Up, Hex]),
	    fail ; true
	),
	format(Out, '~n~n', []).


write_footer(Out, _Options) :-
	format(Out,
	       'static int\n\c
		uflagsW(int code)\n\c
		{ int cp = (unsigned)code / 256;\n\c
		\n  \c
		  if ( cp < UNICODE_MAP_SIZE )\n  \c
		  { const char *s = uflags_map[cp];\n    \c
		    if ( s < (const char *)256 )\n      \c
		      return (int)(intptr_t)s;\n    \c
		    return s[code&0xff];\n  \c
		  }\n  \c
		  return 0;\n\c
		}\n\n', []).


		 /*******************************
		 *	       TABLES		*
		 *******************************/

%%	gen_tables(-Tables, +Options)
%
%	Table is of  the  format  below,   where  CodePage  is  the page
%	(0..255) for 16-bit Unicode and  ValueList   are  the values for
%	each character.
%
%		table(CodePage, ValueList)

gen_tables(Tables, Options) :-
	findall(table(CP,Map), table(CP, Map, Options), Tables).

table(CP, Map, Options) :-
	last_unicode_page(DefPage),
	option(first_codepage(First), Options, 0),
	option(last_codepage(Last), Options, DefPage),
	between(First, Last, CP),
	findall(M, char(CP, M, Options), Map0),
	flat_map(Map0, Map).

char(CP, Value, _Options) :-
	between(0, 255, I),
	Code is 256*CP+I,
	code_flags(Code, Value).

code_flags(Code, Value) :-
	findall(F, flag(Code, F), Fs),
	or(Fs, Value).

or([], 0).
or([H|T], F) :-
	or(T, F0),
	F is F0 \/ H.

flag(Code, Flag) :-
	flag_name(Name, Flag),
	code_flag(Code, Name).

flag_name(id_start,    0x01).
flag_name(id_continue, 0x02).
flag_name(uppercase,   0x04).
flag_name(separator,   0x08).
flag_name(symbol,      0x10).
flag_name(other,       0x20).
flag_name(control,     0x40).

code_flag(C, id_start) :-    unicode_derived_core_property(C, id_start).
code_flag(C, id_continue) :- unicode_derived_core_property(C, id_continue).
code_flag(C, uppercase) :-   unicode_derived_core_property(C, uppercase).
code_flag(C, separator) :-
	unicode_property(C, general_category(Cat)),
	sep_cat(Cat).
code_flag(C, symbol) :-
	unicode_property(C, general_category(Cat)),
	symbol_cat(Cat).
code_flag(C, other) :-
	unicode_property(C, general_category(Cat)),
	other_cat(Cat).
code_flag(C, control) :-
	unicode_property(C, general_category(Cat)),
	control_cat(Cat).
code_flag(C, unassigned) :-
	\+ unicode_property(C, general_category(_)).

% See http://www.unicode.org/reports/tr44/#Property_Values

sep_cat('Zs').		% a space character (of various non-zero widths)
sep_cat('Zl').		% U+2028 LINE SEPARATOR only
sep_cat('Zp').		% U+2029 PARAGRAPH SEPARATOR only

symbol_cat('Sm').	% a symbol of primarily mathematical use
symbol_cat('Sc').	% a currency sign
symbol_cat('Sk').	% a non-letterlike modifier symbol
symbol_cat('So').	% a symbol of other type
symbol_cat('Pc').	% a connecting punctuation mark, like a tie
symbol_cat('Pd').	% a dash or hyphen punctuation mark
symbol_cat('Ps').	% an opening punctuation mark (of a pair)
symbol_cat('Pe').	% a closing punctuation mark (of a pair)
symbol_cat('Pi').	% an initial quotation mark
symbol_cat('Pf').	% a final quotation mark
symbol_cat('Po').	% a punctuation mark of other type

other_cat('No').	% a numeric character of other type
other_cat('Me').	% an enclosing combining mark

control_cat('Cc').	% a C0 or C1 control code
control_cat('Cf').	% a format control character
control_cat('Cs').	% a surrogate code point
control_cat('Co').	% a private-use character
control_cat('Cn').	% a reserved unassigned code point or a noncharacter


flat_map(Map0, Value) :-
	sort(Map0, [Value]), !.
flat_map(Map, Map).
