XML and UNICODE - a quarrelling couple
Obviously xml has nothing to do with x-mas...
This is my input file (UTF-8 with BOM):
<?xml version="1.0" encoding="UTF-8"?>
<saves>
<!-- is Does the prolog help for Unicode support? -->
<item>
<!-- what happens to this & that comment? -->
<name>Wäßrige lösung</name>
<info>Wenig substanz wird in viel wasser gelöst.</info>
...
This is my simple script:
// WriteXmlData-b.jsx
// Read UTF data and write it back
#target framemaker
main ();
function main () {
var oXmlSettings, xmlData, sXmlFile = "WriteXml.xml"; // file exists in same dir as this script;
$.bp(true);
xmlData = GetXMLdata (sXmlFile);
WriteXMLdata (xmlData, sXmlFile);
} // --- End main --------------------------------
function WriteXMLdata (xmlData, sXmlFile) { // =============================================
var fXmlFile;
fXmlFile = new File($.fileName.replace (/[^\\\/]+$/i , sXmlFile));
try {
fXmlFile.open("w");
fXmlFile.write(xmlData);
fXmlFile.close();
} catch (e) {
alert("" + e.message + "\nThere are problems writing the XML file!");
}
return true;
} // --- End WriteXML ----------------------------
function GetXMLdata (sXmlFile) { // =======================================================
var fXmlFile, xData;
if ((sXmlFile == null) || (sXmlFile == undefined)) {
return false;
} else {
fXmlFile = new File($.fileName.replace (/[^\\\/]+$/i , sXmlFile)); // file in script folder
}
if (fXmlFile.exists === false) {
alert ("XML file «" + sXmlFile + "» not found.");
return false;
}
fXmlFile.open("r");
try {
xData = new XML(fXmlFile.read());
fXmlFile.close();
return xData;
} catch (e) {
alert ("Read error on XML file «" + sXmlFile + "». Error:\n "+ e); fXmlFile.close();
return false;
}
} //--- end getXMLdata -----------------------------
When running this program step by step I see the internal presentation

After writing I see in the file (it is still UTF-8 with BOM):

- Where is the prologue?
- Why do I have Chinese characters? The strange characters actually are u4000, u07C0 and u6000.
- What is UNICODE good for XML if it is necessary to use entities for all but the ASCII characters?

