Thursday, 20 February 2014

Create list of files using XSLT

Issue - need a directory listing of XML files using purely XSLT.

Resolution - create an XSLT stylesheet with the following code, save it as lister.xsl, and use Saxon's initial-template option (-it) to call the named template:

java -jar saxon.jar -it:lister -xsl:lister.xsl -o:filelist.xml

<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">

  <!-- Named entry point: invoked with Saxon's -it:lister, so no source document is needed.
       Emits one <file> element per XML document found by collection(). -->
  <xsl:template name="lister">
    <!-- Saxon collection URI: every *.xml under the current directory, recursing into
         sub-directories; documents that fail to parse are reported as warnings. -->
    <!-- NOTE(review): no wrapping root element is emitted here; add one around the
         xsl:for-each if filelist.xml must be a well-formed single-rooted document. -->
    <xsl:for-each
        select="collection('./?select=*.xml;recurse=yes;on-error=warning')" >
      <xsl:element name='file'>
        <!-- full URI of the document -->
        <xsl:attribute name="full" select="document-uri(.)"/>
        <!-- last path segment of the URI, i.e. the bare file name -->
        <xsl:value-of select="tokenize(document-uri(.), '/')[last()]"/>
      </xsl:element>
    </xsl:for-each>
  </xsl:template>

</xsl:stylesheet>

Convert unescaped html to xml

XSLT stylesheet to convert unescaped HTML to XML

This was used to extract the html from a blogger feed which was used across

<xsl:stylesheet version="2.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns:functx="http://www.functx.com"
    xmlns:atom="http://www.w3.org/2005/Atom"
    exclude-result-prefixes="xs functx atom">
 <!-- The xs, functx and atom prefixes are declared because the templates and
      functions of this stylesheet use xs:string, functx:* and atom:content. -->



 <xsl:output omit-xml-declaration="yes" indent="yes"  />

 <xsl:variable name="htmlEntities" as="element()*">
  <entity name="&amp;nbsp;" char="&#160;"/>  <!-- no-break space -->
  <entity name="&amp;iexcl;" char="&#161;"/>  <!-- inverted exclamation mark -->
  <entity name="&amp;cent;" char="&#162;"/>  <!-- cent sign -->
  <entity name="&amp;pound;" char="&#163;"/>  <!-- pound sterling sign -->
  <entity name="&amp;curren;" char="&#164;"/>  <!-- general currency sign -->
  <entity name="&amp;yen;" char="&#165;"/>  <!-- yen sign -->
  <entity name="&amp;brvbar;" char="&#166;"/>  <!-- broken (vertical) bar -->
  <entity name="&amp;sect;" char="&#167;"/>  <!-- section sign -->
  <entity name="&amp;uml;" char="&#168;"/>  <!-- umlaut (dieresis) -->
  <entity name="&amp;copy;" char="&#169;"/>  <!-- copyright sign -->
  <entity name="&amp;ordf;" char="&#170;"/>  <!-- ordinal indicator, feminine -->
  <entity name="&amp;laquo;" char="&#171;"/>  <!-- angle quotation mark, left -->
  <entity name="&amp;not;" char="&#172;"/>  <!-- not sign -->
  <entity name="&amp;shy;" char="&#173;"/>  <!-- soft hyphen -->
  <entity name="&amp;reg;" char="&#174;"/>  <!-- registered sign -->
  <entity name="&amp;macr;" char="&#175;"/>  <!-- macron -->
  <entity name="&amp;deg;" char="&#176;"/>  <!-- degree sign -->
  <entity name="&amp;plusmn;" char="&#177;"/>  <!-- plus-or-minus sign -->
  <entity name="&amp;sup2;" char="&#178;"/>  <!-- superscript two -->
  <entity name="&amp;sup3;" char="&#179;"/>  <!-- superscript three -->
  <entity name="&amp;acute;" char="&#180;"/>  <!-- acute accent -->
  <entity name="&amp;micro;" char="&#181;"/>  <!-- micro sign -->
  <entity name="&amp;para;" char="&#182;"/>  <!-- pilcrow (paragraph sign) -->
  <entity name="&amp;middot;" char="&#183;"/>  <!-- middle dot -->
  <entity name="&amp;cedil;" char="&#184;"/>  <!-- cedilla -->
  <entity name="&amp;sup1;" char="&#185;"/>  <!-- superscript one -->
  <entity name="&amp;ordm;" char="&#186;"/>  <!-- ordinal indicator, masculine -->
  <entity name="&amp;raquo;" char="&#187;"/>  <!-- angle quotation mark, right -->
  <entity name="&amp;frac14;" char="&#188;"/>  <!-- fraction one-quarter -->
  <entity name="&amp;frac12;" char="&#189;"/>  <!-- fraction one-half -->
  <entity name="&amp;frac34;" char="&#190;"/>  <!-- fraction three-quarters -->
  <entity name="&amp;iquest;" char="&#191;"/>  <!-- inverted question mark -->
  <entity name="&amp;Agrave;" char="&#192;"/>  <!-- capital A, grave accent -->
  <entity name="&amp;Aacute;" char="&#193;"/>  <!-- capital A, acute accent -->
  <entity name="&amp;Acirc;" char="&#194;"/>  <!-- capital A, circumflex accent -->
  <entity name="&amp;Atilde;" char="&#195;"/>  <!-- capital A, tilde -->
  <entity name="&amp;Auml;" char="&#196;"/>  <!-- capital A, dieresis or umlaut mark -->
  <entity name="&amp;Aring;" char="&#197;"/>  <!-- capital A, ring -->
  <entity name="&amp;AElig;" char="&#198;"/>  <!-- capital AE diphthong (ligature) -->
  <entity name="&amp;Ccedil;" char="&#199;"/>  <!-- capital C, cedilla -->
  <entity name="&amp;Egrave;" char="&#200;"/>  <!-- capital E, grave accent -->
  <entity name="&amp;Eacute;" char="&#201;"/>  <!-- capital E, acute accent -->
  <entity name="&amp;Ecirc;" char="&#202;"/>  <!-- capital E, circumflex accent -->
  <entity name="&amp;Euml;" char="&#203;"/>  <!-- capital E, dieresis or umlaut mark -->
  <entity name="&amp;Igrave;" char="&#204;"/>  <!-- capital I, grave accent -->
  <entity name="&amp;Iacute;" char="&#205;"/>  <!-- capital I, acute accent -->
  <entity name="&amp;Icirc;" char="&#206;"/>  <!-- capital I, circumflex accent -->
  <entity name="&amp;Iuml;" char="&#207;"/>  <!-- capital I, dieresis or umlaut mark -->
  <entity name="&amp;ETH;" char="&#208;"/>  <!-- capital Eth, Icelandic -->
  <entity name="&amp;Ntilde;" char="&#209;"/>  <!-- capital N, tilde -->
  <entity name="&amp;Ograve;" char="&#210;"/>  <!-- capital O, grave accent -->
  <entity name="&amp;Oacute;" char="&#211;"/>  <!-- capital O, acute accent -->
  <entity name="&amp;Ocirc;" char="&#212;"/>  <!-- capital O, circumflex accent -->
  <entity name="&amp;Otilde;" char="&#213;"/>  <!-- capital O, tilde -->
  <entity name="&amp;Ouml;" char="&#214;"/>  <!-- capital O, dieresis or umlaut mark -->
  <entity name="&amp;times;" char="&#215;"/>  <!-- multiply sign -->
  <entity name="&amp;Oslash;" char="&#216;"/>  <!-- capital O, slash -->
  <entity name="&amp;Ugrave;" char="&#217;"/>  <!-- capital U, grave accent -->
  <entity name="&amp;Uacute;" char="&#218;"/>  <!-- capital U, acute accent -->
  <entity name="&amp;Ucirc;" char="&#219;"/>  <!-- capital U, circumflex accent -->
  <entity name="&amp;Uuml;" char="&#220;"/>  <!-- capital U, dieresis or umlaut mark -->
  <entity name="&amp;Yacute;" char="&#221;"/>  <!-- capital Y, acute accent -->
  <entity name="&amp;THORN;" char="&#222;"/>  <!-- capital THORN, Icelandic -->
  <entity name="&amp;szlig;" char="&#223;"/>  <!-- small sharp s, German (sz ligature) -->
  <entity name="&amp;agrave;" char="&#224;"/>  <!-- small a, grave accent -->
  <entity name="&amp;aacute;" char="&#225;"/>  <!-- small a, acute accent -->
  <entity name="&amp;acirc;" char="&#226;"/>  <!-- small a, circumflex accent -->
  <entity name="&amp;atilde;" char="&#227;"/>  <!-- small a, tilde -->
  <entity name="&amp;auml;" char="&#228;"/>  <!-- small a, dieresis or umlaut mark -->
  <entity name="&amp;aring;" char="&#229;"/>  <!-- small a, ring -->
  <entity name="&amp;aelig;" char="&#230;"/>  <!-- small ae diphthong (ligature) -->
  <entity name="&amp;ccedil;" char="&#231;"/>  <!-- small c, cedilla -->
  <entity name="&amp;egrave;" char="&#232;"/>  <!-- small e, grave accent -->
  <entity name="&amp;eacute;" char="&#233;"/>  <!-- small e, acute accent -->
  <entity name="&amp;ecirc;" char="&#234;"/>  <!-- small e, circumflex accent -->
  <entity name="&amp;euml;" char="&#235;"/>  <!-- small e, dieresis or umlaut mark -->
  <entity name="&amp;igrave;" char="&#236;"/>  <!-- small i, grave accent -->
  <entity name="&amp;iacute;" char="&#237;"/>  <!-- small i, acute accent -->
  <entity name="&amp;icirc;" char="&#238;"/>  <!-- small i, circumflex accent -->
  <entity name="&amp;iuml;" char="&#239;"/>  <!-- small i, dieresis or umlaut mark -->
  <entity name="&amp;eth;" char="&#240;"/>  <!-- small eth, Icelandic -->
  <entity name="&amp;ntilde;" char="&#241;"/>  <!-- small n, tilde -->
  <entity name="&amp;ograve;" char="&#242;"/>  <!-- small o, grave accent -->
  <entity name="&amp;oacute;" char="&#243;"/>  <!-- small o, acute accent -->
  <entity name="&amp;ocirc;" char="&#244;"/>  <!-- small o, circumflex accent -->
  <entity name="&amp;otilde;" char="&#245;"/>  <!-- small o, tilde -->
  <entity name="&amp;ouml;" char="&#246;"/>  <!-- small o, dieresis or umlaut mark -->
  <entity name="&amp;divide;" char="&#247;"/>  <!-- divide sign -->
  <entity name="&amp;oslash;" char="&#248;"/>  <!-- small o, slash -->
  <entity name="&amp;ugrave;" char="&#249;"/>  <!-- small u, grave accent -->
  <entity name="&amp;uacute;" char="&#250;"/>  <!-- small u, acute accent -->
  <entity name="&amp;ucirc;" char="&#251;"/>  <!-- small u, circumflex accent -->
  <entity name="&amp;uuml;" char="&#252;"/>  <!-- small u, dieresis or umlaut mark -->
  <entity name="&amp;yacute;" char="&#253;"/>  <!-- small y, acute accent -->
  <entity name="&amp;thorn;" char="&#254;"/>  <!-- small thorn, Icelandic -->
  <entity name="&amp;yuml;" char="&#255;"/>  <!-- small y, dieresis or umlaut mark -->
  <!-- Latin Extended-B -->
  <entity name="&amp;fnof;" char="&#402;"/>  <!-- latin small f with hook, =function, =florin, u+0192 ISOtech -->

  <!-- Greek -->
  <entity name="&amp;Alpha;" char="&#913;"/>  <!-- greek capital letter alpha,  u+0391 -->
  <entity name="&amp;Beta;" char="&#914;"/>  <!-- greek capital letter beta,  u+0392 -->
  <entity name="&amp;Gamma;" char="&#915;"/>  <!-- greek capital letter gamma,  u+0393 ISOgrk3 -->
  <entity name="&amp;Delta;" char="&#916;"/>  <!-- greek capital letter delta,  u+0394 ISOgrk3 -->
  <entity name="&amp;Epsilon;" char="&#917;"/>  <!-- greek capital letter epsilon,  u+0395 -->
  <entity name="&amp;Zeta;" char="&#918;"/>  <!-- greek capital letter zeta,  u+0396 -->
  <entity name="&amp;Eta;" char="&#919;"/>  <!-- greek capital letter eta,  u+0397 -->
  <entity name="&amp;Theta;" char="&#920;"/>  <!-- greek capital letter theta,  u+0398 ISOgrk3 -->
  <entity name="&amp;Iota;" char="&#921;"/>  <!-- greek capital letter iota,  u+0399 -->
  <entity name="&amp;Kappa;" char="&#922;"/>  <!-- greek capital letter kappa,  u+039A -->
  <entity name="&amp;Lambda;" char="&#923;"/>  <!-- greek capital letter lambda,  u+039B ISOgrk3 -->
  <entity name="&amp;Mu;" char="&#924;"/>  <!-- greek capital letter mu,  u+039C -->
  <entity name="&amp;Nu;" char="&#925;"/>  <!-- greek capital letter nu,  u+039D -->
  <entity name="&amp;Xi;" char="&#926;"/>  <!-- greek capital letter xi,  u+039E ISOgrk3 -->
  <entity name="&amp;Omicron;" char="&#927;"/>  <!-- greek capital letter omicron,  u+039F -->
  <entity name="&amp;Pi;" char="&#928;"/>  <!-- greek capital letter pi,  u+03A0 ISOgrk3 -->
  <entity name="&amp;Rho;" char="&#929;"/>  <!-- greek capital letter rho,  u+03A1 -->
  <!-- (there is no Sigmaf, and no u+03A2 character either) -->
  <entity name="&amp;Sigma;" char="&#931;"/>  <!-- greek capital letter sigma,  u+03A3 ISOgrk3 -->
  <entity name="&amp;Tau;" char="&#932;"/>  <!-- greek capital letter tau,  u+03A4 -->
  <entity name="&amp;Upsilon;" char="&#933;"/>  <!-- greek capital letter upsilon,  u+03A5 ISOgrk3 -->
  <entity name="&amp;Phi;" char="&#934;"/>  <!-- greek capital letter phi,  u+03A6 ISOgrk3 -->
  <entity name="&amp;Chi;" char="&#935;"/>  <!-- greek capital letter chi,  u+03A7 -->
  <entity name="&amp;Psi;" char="&#936;"/>  <!-- greek capital letter psi,  u+03A8 ISOgrk3 -->
  <entity name="&amp;Omega;" char="&#937;"/>  <!-- greek capital letter omega,  u+03A9 ISOgrk3 -->

  <entity name="&amp;alpha;" char="&#945;"/>  <!-- greek small letter alpha, u+03B1 ISOgrk3 -->
  <entity name="&amp;beta;" char="&#946;"/>  <!-- greek small letter beta,  u+03B2 ISOgrk3 -->
  <entity name="&amp;gamma;" char="&#947;"/>  <!-- greek small letter gamma,  u+03B3 ISOgrk3 -->
  <entity name="&amp;delta;" char="&#948;"/>  <!-- greek small letter delta,  u+03B4 ISOgrk3 -->
  <entity name="&amp;epsilon;" char="&#949;"/>  <!-- greek small letter epsilon,  u+03B5 ISOgrk3 -->
  <entity name="&amp;zeta;" char="&#950;"/>  <!-- greek small letter zeta,  u+03B6 ISOgrk3 -->
  <entity name="&amp;eta;" char="&#951;"/>  <!-- greek small letter eta,  u+03B7 ISOgrk3 -->
  <entity name="&amp;theta;" char="&#952;"/>  <!-- greek small letter theta,  u+03B8 ISOgrk3 -->
  <entity name="&amp;iota;" char="&#953;"/>  <!-- greek small letter iota,  u+03B9 ISOgrk3 -->
  <entity name="&amp;kappa;" char="&#954;"/>  <!-- greek small letter kappa,  u+03BA ISOgrk3 -->
  <entity name="&amp;lambda;" char="&#955;"/>  <!-- greek small letter lambda,  u+03BB ISOgrk3 -->
  <entity name="&amp;mu;" char="&#956;"/>  <!-- greek small letter mu,  u+03BC ISOgrk3 -->
  <entity name="&amp;nu;" char="&#957;"/>  <!-- greek small letter nu,  u+03BD ISOgrk3 -->
  <entity name="&amp;xi;" char="&#958;"/>  <!-- greek small letter xi,  u+03BE ISOgrk3 -->
  <entity name="&amp;omicron;" char="&#959;"/>  <!-- greek small letter omicron,  u+03BF NEW -->
  <entity name="&amp;pi;" char="&#960;"/>  <!-- greek small letter pi,  u+03C0 ISOgrk3 -->
  <entity name="&amp;rho;" char="&#961;"/>  <!-- greek small letter rho,  u+03C1 ISOgrk3 -->
  <entity name="&amp;sigmaf;" char="&#962;"/>  <!-- greek small letter final sigma,  u+03C2 ISOgrk3 -->
  <entity name="&amp;sigma;" char="&#963;"/>  <!-- greek small letter sigma,  u+03C3 ISOgrk3 -->
  <entity name="&amp;tau;" char="&#964;"/>  <!-- greek small letter tau,  u+03C4 ISOgrk3 -->
  <entity name="&amp;upsilon;" char="&#965;"/>  <!-- greek small letter upsilon,  u+03C5 ISOgrk3 -->
  <entity name="&amp;phi;" char="&#966;"/>  <!-- greek small letter phi,  u+03C6 ISOgrk3 -->
  <entity name="&amp;chi;" char="&#967;"/>  <!-- greek small letter chi,  u+03C7 ISOgrk3 -->
  <entity name="&amp;psi;" char="&#968;"/>  <!-- greek small letter psi,  u+03C8 ISOgrk3 -->
  <entity name="&amp;omega;" char="&#969;"/>  <!-- greek small letter omega,  u+03C9 ISOgrk3 -->
  <entity name="&amp;thetasym;" char="&#977;"/>  <!-- greek small letter theta symbol,  u+03D1 NEW -->
  <entity name="&amp;upsih;" char="&#978;"/>  <!-- greek upsilon with hook symbol,  u+03D2 NEW -->
  <entity name="&amp;piv;" char="&#982;"/>  <!-- greek pi symbol,  u+03D6 ISOgrk3 -->

  <!-- General Punctuation -->
  <entity name="&amp;bull;" char="&#8226;"/>  <!-- bullet, =black small circle, u+2022 ISOpub  -->
  <!-- bullet is NOT the same as bullet operator, u+2219 -->
  <entity name="&amp;hellip;" char="&#8230;"/>  <!-- horizontal ellipsis, =three dot leader, u+2026 ISOpub  -->
  <entity name="&amp;prime;" char="&#8242;"/>  <!-- prime, =minutes, =feet, u+2032 ISOtech -->
  <entity name="&amp;Prime;" char="&#8243;"/>  <!-- double prime, =seconds, =inches, u+2033 ISOtech -->
  <entity name="&amp;oline;" char="&#8254;"/>  <!-- overline, =spacing overscore, u+203E NEW -->
  <entity name="&amp;frasl;" char="&#8260;"/>  <!-- fraction slash, u+2044 NEW -->
  <!-- Letterlike Symbols -->
  <entity name="&amp;weierp;" char="&#8472;"/>  <!-- script capital P, =power set, =Weierstrass p, u+2118 ISOamso -->
  <entity name="&amp;image;" char="&#8465;"/>  <!-- blackletter capital I, =imaginary part, u+2111 ISOamso -->
  <entity name="&amp;real;" char="&#8476;"/>  <!-- blackletter capital R, =real part symbol, u+211C ISOamso -->
  <entity name="&amp;trade;" char="&#8482;"/>  <!-- trade mark sign, u+2122 ISOnum -->
  <entity name="&amp;alefsym;" char="&#8501;"/>  <!-- alef symbol, =first transfinite cardinal, u+2135 NEW -->
  <!-- alef symbol is NOT the same as hebrew letter alef, u+05D0 although the same glyph
     could be used to depict both characters -->

  <!-- Arrows -->
  <entity name="&amp;larr;" char="&#8592;"/>  <!-- leftwards arrow, u+2190 ISOnum -->
  <entity name="&amp;uarr;" char="&#8593;"/>  <!-- upwards arrow, u+2191 ISOnum-->
  <entity name="&amp;rarr;" char="&#8594;"/>  <!-- rightwards arrow, u+2192 ISOnum -->
  <entity name="&amp;darr;" char="&#8595;"/>  <!-- downwards arrow, u+2193 ISOnum -->
  <entity name="&amp;harr;" char="&#8596;"/>  <!-- left right arrow, u+2194 ISOamsa -->
  <entity name="&amp;crarr;" char="&#8629;"/>  <!-- downwards arrow with corner leftwards, =carriage return, u+21B5 NEW -->
  <entity name="&amp;lArr;" char="&#8656;"/>  <!-- leftwards double arrow, u+21D0 ISOtech -->
  <!-- Unicode does not say that lArr is the same as the 'is implied by' arrow but also 
     does not have any other character for that function. So ? lArr can be used for 
     'is implied by' as ISOtech suggests -->
  <entity name="&amp;uArr;" char="&#8657;"/>  <!-- upwards double arrow, u+21D1 ISOamsa -->
  <entity name="&amp;rArr;" char="&#8658;"/>  <!-- rightwards double arrow, u+21D2 ISOtech -->
  <!-- Unicode does not say this is the 'implies' character but does not have another 
     character with this function so ? rArr can be used for 'implies' as ISOtech suggests -->
  <entity name="&amp;dArr;" char="&#8659;"/>  <!-- downwards double arrow, u+21D3 ISOamsa -->
  <entity name="&amp;hArr;" char="&#8660;"/>  <!-- left right double arrow, u+21D4 ISOamsa -->

  <!-- Mathematical Operators -->
  <entity name="&amp;forall;" char="&#8704;"/>  <!-- for all, u+2200 ISOtech -->
  <entity name="&amp;part;" char="&#8706;"/>  <!-- partial differential, u+2202 ISOtech  -->
  <entity name="&amp;exist;" char="&#8707;"/>  <!-- there exists, u+2203 ISOtech -->
  <entity name="&amp;empty;" char="&#8709;"/>  <!-- empty set, =null set, =diameter, u+2205 ISOamso -->
  <entity name="&amp;nabla;" char="&#8711;"/>  <!-- nabla, =backward difference, u+2207 ISOtech -->
  <entity name="&amp;isin;" char="&#8712;"/>  <!-- element of, u+2208 ISOtech -->
  <entity name="&amp;notin;" char="&#8713;"/>  <!-- not an element of, u+2209 ISOtech -->
  <entity name="&amp;ni;" char="&#8715;"/>  <!-- contains as member, u+220B ISOtech -->
  <!-- should there be a more memorable name than 'ni'? -->
  <entity name="&amp;prod;" char="&#8719;"/>  <!-- n-ary product, =product sign, u+220F ISOamsb -->
  <!-- prod is NOT the same character as u+03A0 'greek capital letter pi' though the same 
     glyph might be used for both -->
  <entity name="&amp;sum;" char="&#8721;"/>  <!-- n-ary sumation, u+2211 ISOamsb -->
  <!-- sum is NOT the same character as u+03A3 'greek capital letter sigma' though the same 
     glyph might be used for both -->
  <entity name="&amp;minus;" char="&#8722;"/>  <!-- minus sign, u+2212 ISOtech -->
  <entity name="&amp;lowast;" char="&#8727;"/>  <!-- asterisk operator, u+2217 ISOtech -->
  <entity name="&amp;radic;" char="&#8730;"/>  <!-- square root, =radical sign, u+221A ISOtech -->
  <entity name="&amp;prop;" char="&#8733;"/>  <!-- proportional to, u+221D ISOtech -->
  <entity name="&amp;infin;" char="&#8734;"/>  <!-- infinity, u+221E ISOtech -->
  <entity name="&amp;ang;" char="&#8736;"/>  <!-- angle, u+2220 ISOamso -->
  <entity name="&amp;and;" char="&#8743;"/>  <!-- logical and, =wedge, u+2227 ISOtech -->
  <entity name="&amp;or;" char="&#8744;"/>  <!-- logical or, =vee, u+2228 ISOtech -->
  <entity name="&amp;cap;" char="&#8745;"/>  <!-- intersection, =cap, u+2229 ISOtech -->
  <entity name="&amp;cup;" char="&#8746;"/>  <!-- union, =cup, u+222A ISOtech -->
  <entity name="&amp;int;" char="&#8747;"/>  <!-- integral, u+222B ISOtech -->
  <entity name="&amp;there4;" char="&#8756;"/>  <!-- therefore, u+2234 ISOtech -->
  <entity name="&amp;sim;" char="&#8764;"/>  <!-- tilde operator, =varies with, =similar to, u+223C ISOtech -->
  <!-- tilde operator is NOT the same character as the tilde, u+007E, although the same 
     glyph might be used to represent both  -->
  <entity name="&amp;cong;" char="&#8773;"/>  <!-- approximately equal to, u+2245 ISOtech -->
  <entity name="&amp;asymp;" char="&#8776;"/>  <!-- almost equal to, =asymptotic to, u+2248 ISOamsr -->
  <entity name="&amp;ne;" char="&#8800;"/>  <!-- not equal to, u+2260 ISOtech -->
  <entity name="&amp;equiv;" char="&#8801;"/>  <!-- identical to, u+2261 ISOtech -->
  <entity name="&amp;le;" char="&#8804;"/>  <!-- less-than or equal to, u+2264 ISOtech -->
  <entity name="&amp;ge;" char="&#8805;"/>  <!-- greater-than or equal to, u+2265 ISOtech -->
  <entity name="&amp;sub;" char="&#8834;"/>  <!-- subset of, u+2282 ISOtech -->
  <entity name="&amp;sup;" char="&#8835;"/>  <!-- superset of, u+2283 ISOtech -->
  <!-- note that nsup, 'not a superset of, u+2283' is not covered by the Symbol font 
     encoding and is not included. Should it be, for symmetry? It is in ISOamsn  --> 
  <entity name="&amp;nsub;" char="&#8836;"/>  <!-- not a subset of, u+2284 ISOamsn -->
  <entity name="&amp;sube;" char="&#8838;"/>  <!-- subset of or equal to, u+2286 ISOtech -->
  <entity name="&amp;supe;" char="&#8839;"/>  <!-- superset of or equal to, u+2287 ISOtech -->
  <entity name="&amp;oplus;" char="&#8853;"/>  <!-- circled plus, =direct sum, u+2295 ISOamsb -->
  <entity name="&amp;otimes;" char="&#8855;"/>  <!-- circled times, =vector product, u+2297 ISOamsb -->
  <entity name="&amp;perp;" char="&#8869;"/>  <!-- up tack, =orthogonal to, =perpendicular, u+22A5 ISOtech -->
  <entity name="&amp;sdot;" char="&#8901;"/>  <!-- dot operator, u+22C5 ISOamsb -->
  <!-- dot operator is NOT the same character as u+00B7 middle dot -->

  <!-- Miscellaneous Technical -->
  <entity name="&amp;lceil;" char="&#8968;"/>  <!-- left ceiling, =apl upstile, u+2308, ISOamsc  -->
  <entity name="&amp;rceil;" char="&#8969;"/>  <!-- right ceiling, u+2309, ISOamsc  -->
  <entity name="&amp;lfloor;" char="&#8970;"/>  <!-- left floor, =apl downstile, u+230A, ISOamsc  -->
  <entity name="&amp;rfloor;" char="&#8971;"/>  <!-- right floor, u+230B, ISOamsc  -->
  <entity name="&amp;lang;" char="&#9001;"/>  <!-- left-pointing angle bracket, =bra, u+2329 ISOtech -->
  <!-- lang is NOT the same character as u+003C 'less than' 
     or u+2039 'single left-pointing angle quotation mark' -->
  <entity name="&amp;rang;" char="&#9002;"/>  <!-- right-pointing angle bracket, =ket, u+232A ISOtech -->
  <!-- rang is NOT the same character as u+003E 'greater than' 
     or u+203A 'single right-pointing angle quotation mark' -->

  <!-- Geometric Shapes -->
  <entity name="&amp;loz;" char="&#9674;"/>  <!-- lozenge, u+25CA ISOpub -->

  <!-- Miscellaneous Symbols -->
  <entity name="&amp;spades;" char="&#9824;"/>  <!-- black spade suit, u+2660 ISOpub -->
  <!-- black here seems to mean filled as opposed to hollow -->
  <entity name="&amp;clubs;" char="&#9827;"/>  <!-- black club suit, =shamrock, u+2663 ISOpub -->
  <entity name="&amp;hearts;" char="&#9829;"/>  <!-- black heart suit, =valentine, u+2665 ISOpub -->
  <entity name="&amp;diams;" char="&#9830;"/>  <!-- black diamond suit, u+2666 ISOpub -->
  <!-- C0 Controls and Basic Latin -->
  <entity name="&amp;quot;" char="&#34;"/>  <!--  quotation mark, =apl quote, u+0022 ISOnum -->
  <entity name="&amp;amp;" char="&#38;amp;"/>  <!--  ampersand, u+0026 ISOnum -->
  <entity name="&amp;lt;" char="&#60;"/>  <!--  less-than sign, u+003C ISOnum -->
  <entity name="&amp;gt;" char="&#62;"/>  <!--  greater-than sign, u+003E ISOnum -->

  <!-- Latin Extended-A -->
  <entity name="&amp;OElig;" char="&#338;"/>  <!--  latin capital ligature oe, u+0152 ISOlat2 -->
  <entity name="&amp;oelig;" char="&#339;"/>  <!--  latin small ligature oe, u+0153 ISOlat2 -->
  <!-- ligature is a misnomer, this is a separate character in some languages -->
  <entity name="&amp;Scaron;" char="&#352;"/>  <!--  latin capital letter s with caron, u+0160 ISOlat2 -->
  <entity name="&amp;scaron;" char="&#353;"/>  <!--  latin small letter s with caron, u+0161 ISOlat2 -->
  <entity name="&amp;Yuml;" char="&#376;"/>  <!--  latin capital letter y with diaeresis, u+0178 ISOlat2 -->

  <!-- Spacing Modifier Letters -->
  <entity name="&amp;circ;" char="&#710;"/>  <!--  modifier letter circumflex accent, u+02C6 ISOpub -->
  <entity name="&amp;tilde;" char="&#732;"/>  <!--  small tilde, u+02DC ISOdia -->

  <!-- General Punctuation -->
  <entity name="&amp;ensp;" char="&#8194;"/>  <!--  en space, u+2002 ISOpub -->
  <entity name="&amp;emsp;" char="&#8195;"/>  <!--  em space, u+2003 ISOpub -->
  <entity name="&amp;thinsp;" char="&#8201;"/>  <!--  thin space, u+2009 ISOpub -->
  <entity name="&amp;zwnj;" char="&#8204;"/>  <!--  zero width non-joiner, u+200C NEW RFC 2070 -->
  <entity name="&amp;zwj;" char="&#8205;"/>  <!--  zero width joiner, u+200D NEW RFC 2070 -->
  <entity name="&amp;lrm;" char="&#8206;"/>  <!--  left-to-right mark, u+200E NEW RFC 2070 -->
  <entity name="&amp;rlm;" char="&#8207;"/>  <!--  right-to-left mark, u+200F NEW RFC 2070 -->
  <entity name="&amp;ndash;" char="&#8211;"/>  <!--  en dash, u+2013 ISOpub -->
  <entity name="&amp;mdash;" char="&#8212;"/>  <!--  em dash, u+2014 ISOpub -->
  <entity name="&amp;lsquo;" char="&#8216;"/>  <!--  left single quotation mark, u+2018 ISOnum -->
  <entity name="&amp;rsquo;" char="&#8217;"/>  <!--  right single quotation mark, u+2019 ISOnum -->
  <entity name="&amp;sbquo;" char="&#8218;"/>  <!--  single low-9 quotation mark, u+201A NEW -->
  <entity name="&amp;ldquo;" char="&#8220;"/>  <!--  left double quotation mark, u+201C ISOnum -->
  <entity name="&amp;rdquo;" char="&#8221;"/>  <!--  right double quotation mark, u+201D ISOnum -->
  <entity name="&amp;bdquo;" char="&#8222;"/>  <!--  double low-9 quotation mark, u+201E NEW -->
  <entity name="&amp;dagger;" char="&#8224;"/>  <!--  dagger, u+2020 ISOpub -->
  <entity name="&amp;Dagger;" char="&#8225;"/>  <!--  double dagger, u+2021 ISOpub -->
  <entity name="&amp;permil;" char="&#8240;"/>  <!--  per mille sign, u+2030 ISOtech -->
  <entity name="&amp;lsaquo;" char="&#8249;"/>  <!--  single left-pointing angle quotation mark, u+2039 ISO proposed -->
  <!-- lsaquo is proposed but not yet ISO standardised -->
  <entity name="&amp;rsaquo;" char="&#8250;"/>  <!--  single right-pointing angle quotation mark, u+203A ISO proposed -->
  <!-- rsaquo is proposed but not yet ISO standardised -->
 </xsl:variable>


 <!-- Entry point for the source document.
      NOTE(review): the original body of this template was lost; walking the
      child elements is the minimal assumed form - confirm against the original. -->
 <xsl:template match="/">
   <xsl:apply-templates select="*" />
 </xsl:template>

 <!-- Walk elements and attributes without copying them; text nodes are never
      selected, so only the atom:content template below produces output. -->
 <xsl:template match="* | @*">
   <xsl:apply-templates select="* | @*" />
 </xsl:template>

 <!-- Replace every named HTML entity in the content with its character and emit
      the result unescaped, so the escaped HTML is serialised as markup. -->
 <xsl:template match="atom:content">
  <xsl:variable name="from" select="($htmlEntities/@name)"/>
  <xsl:variable name="to" select="($htmlEntities/@char)"/>
   <xsl:value-of select="functx:replace-multi(. ,$from,$to)" disable-output-escaping="yes" />
 </xsl:template>

 <!-- functx:replace-multi
      Applies a series of regex replacements to $arg: the Nth pattern in
      $changeFrom is replaced by the Nth string in $changeTo. When $changeTo is
      shorter than $changeFrom, the unmatched patterns are replaced by ''.
      Recurses on the tail of both sequences until $changeFrom is exhausted. -->
 <xsl:function name="functx:replace-multi" as="xs:string?">
  <xsl:param name="arg" as="xs:string?"/>
  <xsl:param name="changeFrom" as="xs:string*"/>
  <xsl:param name="changeTo" as="xs:string*"/>

  <xsl:sequence select="
     if (count($changeFrom) > 0)
     then functx:replace-multi(
      replace($arg, $changeFrom[1],
       functx:if-absent($changeTo[1], '')),
      $changeFrom[position() > 1],
      $changeTo[position() > 1])
     else $arg "/>
 </xsl:function>


 <!-- functx:if-absent
      Returns $arg when it is a non-empty sequence, otherwise returns $value. -->
 <xsl:function name="functx:if-absent" as="item()*">
  <xsl:param name="arg" as="item()*"/>
  <xsl:param name="value" as="item()*"/>

  <xsl:sequence select="
   if (exists($arg))
   then $arg
   else $value
  "/>
 </xsl:function>


Tuesday, 11 February 2014

Rename MarkLogic Document Function

Simple function to rename a document within a MarkLogic database and retain the collections and permissions:

(:~
 : Renames a document by inserting its content at $new-uri and deleting $old-uri,
 : carrying over permissions, collections, quality and user-defined properties.
 :
 : @param $old-uri  URI of the existing document
 : @param $new-uri  URI the document should have afterwards
 : @return empty sequence; the function only performs updates
 :
 : NOTE(review): the delete, quality argument and properties source expression were
 : missing from the published snippet and are restored from the MarkLogic
 : documentation example - verify against your MarkLogic version.
 :)
declare function local:document-rename(
   $old-uri as xs:string, $new-uri as xs:string)
  as empty-sequence()
{
    (: remove the document at the old URI :)
    xdmp:document-delete($old-uri)
    ,
    let $permissions := xdmp:document-get-permissions($old-uri)
    let $collections := xdmp:document-get-collections($old-uri)
    return xdmp:document-insert(
      $new-uri, doc($old-uri),
      (: fall back to the defaults when the old document had none :)
      if ($permissions) then $permissions
      else xdmp:default-permissions(),
      if ($collections) then $collections
      else xdmp:default-collections(),
      xdmp:document-get-quality($old-uri)
    )
    ,
    (: copy only properties outside the prop: namespace :)
    let $prop-ns := namespace-uri(<prop:properties/>)
    let $properties :=
      xdmp:document-properties($old-uri)/node()
        [ namespace-uri(.) ne $prop-ns ]
    return xdmp:document-set-properties($new-uri, $properties)
};