Module:Unicode data

Edit links

Topic	Function	Parameter type (string = by character(s); c.p. = by 0xHex value)	Example	Returns	Character
Unicode character name	`\|lookup\|name`	code point	`{{#invoke:Unicode data\|lookup\|name\|0xA9}}` `{{#invoke:Unicode data\|lookup\|name\|0x0007}}`	COPYRIGHT SIGN <control-0007>	©
Scripts	`\|lookup\|script`	code point	`{{#invoke:Unicode data\|lookup\|script\|A061}}`	Yiii	ꁡ
Blocks	`\|lookup\|block`	code point	`{{#invoke:Unicode data\|lookup\|block\|A061}}`	Yi Syllables	ꁡ
Planes	`\|lookup\|plane`	code point	`{{#invoke:Unicode data\|lookup\|plane\|0xA9}}` `{{#invoke:Unicode data\|lookup\|plane\|0x1F608}}`	Basic Multilingual Plane Supplementary Multilingual Plane	© 😈
General Category	`\|lookup\|category`	code point	`{{#invoke:Unicode data\|lookup\|category\|0xA9}}` `{{#invoke:Unicode data\|lookup\|category\|0x002B}}`	So Sm	© +
Controls	`\|lookup\|control`	code point	`{{#invoke:Unicode data\|lookup\|control\|A9}}` `{{#invoke:Unicode data\|lookup\|control\|FFFF}}`	assigned unassigned	©
Unihan properties	`\|lookup\|kCantonese`	code point	`{{#invoke:Unicode data\|lookup\|kCantonese\|6e2f}}`	gong2	港
Latin script	`\|is\|Latin`	string	`{{#invoke:Unicode data\|is\|Latin\|abcŁíā̀}}` `{{#invoke:Unicode data\|is\|Latin\|abc文xyz}}`	true false
WP:Article title(WP:NCTR)	`\|is\|valid_pagename`	string	`{{#invoke:Unicode data\|is\|valid_pagename\|Main_page}}` `{{#invoke:Unicode data\|is\|valid_pagename\|# (disambiguation)}}`	true false
Bidirectionality, right-to-left scripts	`\|is\|rtl`	string	`{{#invoke:Unicode data\|is\|rtl\|ش}}` `{{#invoke:Unicode data\|is\|rtl\|34}}`	true false	ش 4
Combining character	`\|is\|combining`	code point	`{{#invoke:Unicode data\|is\|combining\|0300}}` `{{#invoke:Unicode data\|is\|combining\|64}}`	true false	̀ d
Character assignation	`\|is\|assigned`	code point	`{{#invoke:Unicode data\|is\|assigned\|A061}}` `{{#invoke:Unicode data\|is\|assigned\|FFEF}}`	true false	ꁡ ;
Printable	`\|is\|printable`	code point	`{{#invoke:Unicode data\|is\|printable\|0061}}` `{{#invoke:Unicode data\|is\|printable\|0007}}` `{{#invoke:Unicode data\|is\|printable\|FFFF}}`	>true< >false< >false<	>a< >< ><
Whitespace character § Unicode	`\|is\|whitespace`	code point	`{{#invoke:Unicode data\|is\|whitespace\|0x20}}` `{{#invoke:Unicode data\|is\|whitespace\|0xA0}}` `{{#invoke:Unicode data\|is\|whitespace\|0x64}}`	>true< >true< NBSP >false<	> < > < >d<
Alias names	`\|aliases`		[application unknown]		&#x; &#x;
Combining class	`\|`		[application unknown]		&#x; &#x;
Age	`\|`		[application unknown]
get_best_script	`\|get_best_script`		[application unknown]

local p = {}

local floor = math.floor

-- Raises a formatted error.
-- errorf(level, fmt, ...) raises at the given stack level relative to the
-- caller of errorf; errorf(fmt, ...) raises at level 2.
local function errorf(level, ...)
	if type(level) == "number" then
		return error(string.format(...), level + 1)
	else -- level is actually the format string.
		return error(string.format(level, ...), 2)
	end
end

-- Binary search over "ranges", an array of { low, high, ... } arrays sorted by
-- code point. The number of ranges is read from ranges.length when present,
-- otherwise computed with [[Module:TableTools]].
-- Returns the matching range and its index, or nil and the last index probed
-- (nil if "ranges" is empty).
local function binary_range_search(codepoint, ranges)
	local low, mid, high
	low, high = 1, ranges.length or require "Module:TableTools".length(ranges)
	while low <= high do
		mid = floor((low + high) / 2)
		local range = ranges[mid]
		if codepoint < range[1] then
			high = mid - 1
		elseif codepoint <= range[2] then
			return range, mid
		else
			low = mid + 1
		end
	end
	return nil, mid
end
p.binary_range_search = binary_range_search

--[[
local function linear_range_search(codepoint, ranges)
	for i, range in ipairs(ranges) do
		if range[1] <= codepoint and codepoint <= range[2] then
			return range
		end
	end
end
--]]

-- Load a module by indexing "loader" with the name of the module minus the
-- "Module:Unicode data/" part. For instance, loader.blocks returns
-- [[Module:Unicode data/blocks]]. If a module cannot be loaded, false will be
-- returned.
local loader = setmetatable({}, {
	__index = function (self, key)
		local success, data = pcall(mw.loadData, "Module:Unicode data/" .. key)
		if not success then
			data = false
		end
		-- Memoize the result (including failures) so each subpage is tried once.
		self[key] = data
		return data
	end
})

-- For the algorithm used to generate Hangul Syllable names,
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification:
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf

-- binary_range_search assumes these are ordered by codepoint.
-- Do not place them in a random order!
-- Each entry is { low, high, name }, where name is either a format string
-- applied to the code point or a function that receives the code point.
local name_hooks = {
	{ 0x00, 0x1F, "<control-%04X>" }, -- C0 control characters
	{ 0x7F, 0x9F, "<control-%04X>" }, -- DEL and C1 control characters
	{ 0x3400, 0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
	{ 0x4E00, 0x9FFF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
	{ 0xAC00, 0xD7A3, function (codepoint) -- Hangul Syllables
		local Hangul_data = loader.Hangul
		local syllable_index = codepoint - 0xAC00

		return ("HANGUL SYLLABLE %s%s%s"):format(
			Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)],
			Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count)
				/ Hangul_data.trail_count)],
			Hangul_data.trails[syllable_index % Hangul_data.trail_count]
		)
	end },
	-- High Surrogates, High Private Use Surrogates, Low Surrogates
	{ 0xD800, 0xDFFF, "<surrogate-%04X>" },
	{ 0xE000, 0xF8FF, "<private-use-%04X>" }, -- Private Use
	-- CJK Compatibility Ideographs
	{ 0xF900, 0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{ 0xFA70, 0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{ 0xFE00, 0xFE0F, function (codepoint) -- Variation Selectors
		return ("VARIATION SELECTOR-%d"):format(codepoint - 0xFE00 + 1)
	end },
	{ 0x13460, 0x143FA, "EGYPTIAN HIEROGLYPH-%04X" }, -- Egyptian Hieroglyphs Extended-A
	{ 0x17000, 0x187FF, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph
	{ 0x18800, 0x18AFF, function (codepoint)
		return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
	end },
	{ 0x18B00, 0x18CD5, "KHITAN SMALL SCRIPT CHARACTER-%04X" }, -- Khitan Small Script
	{ 0x18CFF, 0x18CFF, "KHITAN SMALL SCRIPT CHARACTER-%04X" }, -- Khitan Small Script
	{ 0x18D00, 0x18D1E, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement
	{ 0x18D80, 0x18DF2, function (codepoint)
		return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x18A7F)
	end },
	{ 0x1B170, 0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
	{ 0x20000, 0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
	{ 0x2A700, 0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C, D
	{ 0x2B820, 0x2CEAD, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
	{ 0x2CEB0, 0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
	{ 0x2EBF0, 0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I
	{ 0x2F800, 0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, -- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
	{ 0x30000, 0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
	{ 0x31350, 0x33479, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H, J
	{ 0xE0100, 0xE01EF, function (codepoint) -- Variation Selectors Supplement
		return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
	end },
	{ 0xF0000, 0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
	{ 0x100000, 0x10FFFD, "<private-use-%04X>" } -- Plane 16 Private Use
}
name_hooks.length = #name_hooks

-- The name hook that matched most recently; checked before searching again.
local name_range_cache

-- Applies a name hook's third field (format string or function) to a code point.
local function generate_name(data, codepoint)
	if type(data) == "string" then
		return data:format(codepoint)
	else
		return data(codepoint)
	end
end

--[[
-- Checks that the code point is a number and in range.
-- Does not check whether code point is an integer.
-- Not used
local function check_codepoint(funcName, argIdx, val)
	require 'libraryUtil'.checkType(funcName, argIdx, val, 'number')
	if codepoint < 0 or 0x10FFFF < codepoint then
		errorf("Codepoint %04X out of range", codepoint)
	end
end
--]]

-- Returns true if the code point is a noncharacter, false otherwise.
function p.is_noncharacter(codepoint)
	-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
	-- (Cn) and specifically noncharacters:
	-- https://www.unicode.org/faq/private_use.html#nonchar4
	return 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
		or floor(codepoint % 0x10000) >= 0xFFFE)
end

-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
-- Returns the character name or a code point label such as <control-0007>,
-- <noncharacter-FFFE> or <reserved-0378>.
function p.lookup_name(codepoint)
	if p.is_noncharacter(codepoint) then
		return ("<noncharacter-%04X>"):format(codepoint)
	end

	if name_range_cache -- Check if previously used "name hook" applies to this code point.
	and codepoint >= name_range_cache[1]
	and codepoint <= name_range_cache[2] then
		return generate_name(name_range_cache[3], codepoint)
	end

	local range = binary_range_search(codepoint, name_hooks)
	if range then
		name_range_cache = range
		return generate_name(range[3], codepoint)
	end

	-- Names are split into one data module per block of 0x1000 code points.
	-- floor() keeps the %X argument integral.
	local data = loader[('names/%03X'):format(floor(codepoint / 0x1000))]

	if data and data[codepoint] then
		return data[codepoint]

	-- Unassigned (Cn) consists of noncharacters and reserved characters.
	-- The character has been established not to be a noncharacter,
	-- and if it were assigned, its name would already been retrieved,
	-- so it must be reserved.
	else
		return ("<reserved-%04X>"):format(codepoint)
	end
end

-- Returns the entry for the code point in the matching "images" data module,
-- or nothing if that module could not be loaded.
function p.lookup_image(codepoint)
	local data = loader[('images/%03X'):format(floor(codepoint / 0x1000))]

	if data then
		return data[codepoint]
	end
end

local planes = {
	[ 0] = "Basic Multilingual Plane";
	[ 1] = "Supplementary Multilingual Plane";
	[ 2] = "Supplementary Ideographic Plane";
	[ 3] = "Tertiary Ideographic Plane";
	[14] = "Supplementary Special-purpose Plane";
	[15] = "Supplementary Private Use Area-A";
	[16] = "Supplementary Private Use Area-B";
}

-- Stateless iterator step for p.enum_blocks.
local function block_iter(blocks, i)
	i = i + 1
	local data = blocks[i]
	if data then
		-- Unpack doesn't work on tables loaded with mw.loadData.
		return i, data[1], data[2], data[3]
	end
end

-- An ipairs-type iterator generator for the list of blocks.
function p.enum_blocks()
	local blocks = loader.blocks
	return block_iter, blocks, 0
end

-- Returns the name of the plane containing the code point, or "Plane N" for
-- planes without an entry in "planes".
function p.lookup_plane(codepoint)
	local i = floor(codepoint / 0x10000)
	return planes[i] or ("Plane %u"):format(i)
end

-- Returns the name of the block containing the code point, or "No Block".
function p.lookup_block(codepoint)
	local blocks = loader.blocks
	local range = binary_range_search(codepoint, blocks)
	if range then
		return range[3]
	else
		return "No Block"
	end
end

-- Returns the blocks entry whose third field equals "name", or nothing.
function p.get_block_info(name)
	for i, block in ipairs(loader.blocks) do
		if block[3] == name then
			return block
		end
	end
end

-- Returns false if the string contains a code point from the list below or a
-- non-printable one, or if it has no character other than space separators;
-- true otherwise.
function p.is_valid_pagename(pagename)
	local has_nonws = false

	for cp in mw.ustring.gcodepoint(pagename) do
		if (cp == 0x0023) -- #
		or (cp == 0x005B) -- [
		or (cp == 0x005D) -- ]
		or (cp == 0x007B) -- {
		or (cp == 0x007C) -- |
		or (cp == 0x007D) -- }
		or (cp == 0x180E) -- MONGOLIAN VOWEL SEPARATOR
		or ((cp >= 0x2000) and (cp <= 0x200A)) -- spaces in General Punctuation block
		or (cp == 0xFFFD) -- REPLACEMENT CHARACTER
		then
			return false
		end

		local printable, result = p.is_printable(cp)
		if not printable then
			return false
		end

		if result ~= "space-separator" then
			has_nonws = true
		end
	end

	return has_nonws
end

-- Returns the elements of "what" from index "from" (default 1) onwards.
-- Needed because unpack doesn't work on tables loaded with mw.loadData.
local function manual_unpack(what, from)
	-- Apply the default before it is used in arithmetic below.
	from = from or 1

	if what[from + 1] == nil then
		return what[from]
	end

	local result = {}
	for i, item in ipairs(what) do
		if i >= from then
			table.insert(result, item)
		end
	end
	return unpack(result)
end

-- Orders ranges by their first code point.
local function compare_ranges(range1, range2)
	return range1[1] < range2[1]
end

-- Creates a function to look up data in a module that contains "singles" (a
-- code point-to-data map) and "ranges" (an array containing arrays that contain
-- the low and high code points of a range and the data associated with that
-- range).
-- "data_module_subpage" names the subpage of [[Module:Unicode data]] that holds
-- the "singles" and "ranges" tables; it is loaded on first use.
-- "match_func" is passed the code point and either the data or the "dots", and
-- generates the final result of the function.
-- The varargs ("dots") describes the default data to be returned if there wasn't
-- a match.
-- In case the function is used more than once, "cache" saves ranges that have
-- already been found to match, or a range whose data is the default if there
-- was no match.
local function memo_lookup(data_module_subpage, match_func, ...)
	local dots = { ... }
	-- "length" is maintained by hand so that binary_range_search does not have
	-- to fall back to [[Module:TableTools]] on every cache lookup.
	local cache = { length = 0 }
	local singles, ranges

	-- Adds a range to the cache and keeps the cache ordered for binary search.
	local function remember(range)
		cache.length = cache.length + 1
		cache[cache.length] = range
		table.sort(cache, compare_ranges)
	end

	return function (codepoint)
		if not singles then
			local data_module = loader[data_module_subpage]
			singles, ranges = data_module.singles, data_module.ranges
		end

		if singles[codepoint] then
			return match_func(codepoint, singles[codepoint])
		end

		local range = binary_range_search(codepoint, cache)
		if range then
			return match_func(codepoint, manual_unpack(range, 3))
		end

		local index
		range, index = binary_range_search(codepoint, ranges)
		if range then
			remember(range)
			return match_func(codepoint, manual_unpack(range, 3))
		end

		if ranges[index] then
			-- The code point lies in the gap next to ranges[index]; cache that
			-- whole gap together with the default data.
			local dots_range
			if codepoint > ranges[index][2] then
				dots_range = {
					ranges[index][2] + 1,
					ranges[index + 1] and ranges[index + 1][1] - 1 or 0x10FFFF,
					unpack(dots)
				}
			else -- codepoint < ranges[index][1]
				dots_range = {
					ranges[index - 1] and ranges[index - 1][2] + 1 or 0,
					ranges[index][1] - 1,
					unpack(dots)
				}
			end
			remember(dots_range)
		end

		-- No match: hand the default data to match_func.
		return match_func(codepoint, unpack(dots))
	end
end

-- Get a code point's combining class value in [[Module:Unicode data/combining]],
-- and return whether this value is not zero. Zero is assigned as the default
-- if the combining class value is not found in this data module.
-- That is, return true if character is combining, or false if it is not.
-- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- more information.
p.is_combining = memo_lookup(
	"combining",
	function (codepoint, combining_class)
		return combining_class and combining_class ~= 0 or false
	end,
	0)

-- Inserts a dotted circle before every combining character in the string.
function p.add_dotted_circle(str)
	return (mw.ustring.gsub(str, ".",
		function(char)
			if p.is_combining(mw.ustring.codepoint(char)) then
				return '◌' .. char
			end
		end))
end

-- Looks the code point up in [[Module:Unicode data/control]]; the default is
-- "assigned".
local lookup_control = memo_lookup(
	"control",
	function (codepoint, ccc)
		return ccc or "assigned"
	end,
	"assigned")
p.lookup_control = lookup_control

-- Returns true unless lookup_control reports "unassigned".
function p.is_assigned(codepoint)
	return lookup_control(codepoint) ~= "unassigned"
end

-- Returns whether lookup_control reports "assigned" or "space-separator",
-- followed by the lookup_control result itself.
function p.is_printable(codepoint)
	local result = lookup_control(codepoint)
	return (result == "assigned") or (result == "space-separator"), result
end

-- Returns whether lookup_control reports "space-separator", followed by the
-- lookup_control result itself.
function p.is_whitespace(codepoint)
	local result = lookup_control(codepoint)
	return (result == "space-separator"), result
end

-- Looks the code point up in [[Module:Unicode data/category]]; the default is
-- "Cn".
p.lookup_category = memo_lookup(
	"category",
	function (codepoint, category)
		return category
	end,
	"Cn")

-- Looks the code point up in [[Module:Unicode data/scripts]]; the default is
-- "Zzzz".
local lookup_script = memo_lookup(
	"scripts",
	function (codepoint, script_code)
		return script_code or 'Zzzz'
	end,
	"Zzzz")
p.lookup_script = lookup_script

-- Returns the single script code used in the string, ignoring Zyyy, Zinh and
-- Zzzz. Returns nothing when the string uses two or more other scripts.
function p.get_best_script(str)
	-- Check type of argument, because mw.text.decode coerces numbers to strings!
	require "libraryUtil".checkType("get_best_script", 1, str, "string")

	-- Convert HTML character references (including named character references,
	-- or character entities) to characters.
	str = mw.text.decode(str, true)

	local scripts = {}
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)

		-- Ignore "Inherited", "Undetermined", or "Uncoded" scripts.
		if not (script == "Zyyy" or script == "Zinh" or script == "Zzzz") then
			scripts[script] = true
		end
	end

	-- If scripts does not contain two or more keys,
	-- return first and only key (script code) in table.
	if not next(scripts, next(scripts)) then
		return next(scripts)
	end -- else return majority script, or else "Zzzz"?
end

-- Checks that a string contains at least one Latin-script character and
-- otherwise only characters of ignorable scripts (Zyyy, Zinh, Zzzz).
function p.is_Latin(str)
	require "libraryUtil".checkType("is_Latin", 1, str, "string")
	str = mw.text.decode(str, true)

	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0340-U+10FFFF. If they are not found and there is at least
	-- one Latin-script character, the string counts as Latin, because the rest
	-- of the characters can only be Zyyy, Zinh, and Zzzz.
	-- The only scripts found below U+0370 (the first code point of the Greek
	-- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz.
	-- See the codepage in the [[UTF-8]] article.
	if not str:find "[\205-\244]" then
		for codepoint in mw.ustring.gcodepoint(str) do
			if lookup_script(codepoint) == "Latn" then
				return true
			end
		end
	end

	local Latn = false
	local i = 0; -- indexer for use in error messages

	for codepoint in mw.ustring.gcodepoint(str) do
		i = i + 1; -- bump the indexer
		local script = lookup_script(codepoint)

		if script == "Latn" then
			Latn = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			return false, i -- abandon as not Latn; identify the offending character's position
		end
	end

	return Latn, (not Latn and i) or nil -- when <Latn> false, return offending character's position as second return value; nil else
end

-- Checks that a string contains only characters belonging to right-to-left
-- scripts, or characters of ignorable scripts.
function p.is_rtl(str)
	require "libraryUtil".checkType("is_rtl", 1, str, "string")
	str = mw.text.decode(str, true)

	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0580-U+10FFFF. If they are not found, the string can only
	-- have characters from a left-to-right script, because the first code point
	-- in a right-to-left script is U+0591, in the Hebrew block.
	if not str:find "[\214-\244]" then
		return false
	end

	local result = false
	local rtl = loader.scripts.rtl
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)

		if rtl[script] then
			result = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			return false
		end
	end

	return result
end

--[[--------------------------< I S _ R T L _ F R A M E >------------------------------------------------------

external entry from an {{#invoke:}} to determine if a string of text is rtl. Strips html and html-like tags so
that those tags don't corrupt the is-rtl-is-not-rtl determination; this added for the cases where the rtl text
has <br /> tags.

]]

function p.is_rtl_frame(frame)
	local str = frame.args[1]; -- get the string from the {{#invoke:}} frame
	str = str:gsub('%b<>', ''); -- strip any html and html-like tags
	return p.is_rtl(str); -- return if whatever remains rtl; false else
end

-- Reads args[arg] as a hexadecimal code point and returns it as a number.
-- Raises an error if the parameter is missing, is not hexadecimal, or lies
-- outside 0-0x10FFFF.
local function get_codepoint(args, arg)
	local codepoint_string = args[arg]
		or errorf(2, "Parameter %s is required", tostring(arg))
	local codepoint = tonumber(codepoint_string, 16)
		or errorf(2, "Parameter %s is not a code point in hexadecimal base",
			tostring(arg))
	if not (0 <= codepoint and codepoint <= 0x10FFFF) then
		errorf(2, "code point in parameter %s out of range", tostring(arg))
	end
	return codepoint
end

-- Returns the function p[prefix .. trimmed args[arg]].
-- Raises an error if the parameter is missing or no such function exists.
local function get_func(args, arg, prefix)
	local suffix = args[arg]
		or errorf(2, "Parameter %s is required", tostring(arg))
	suffix = mw.text.trim(suffix)
	local func_name = prefix .. suffix
	local func = p[func_name]
		or errorf(2, "There is no function '%s'", func_name)
	return func
end

-- This function allows any of the "lookup" functions to be invoked. The first
-- parameter is the word after "lookup_"; the second parameter is the code point
-- in hexadecimal base.
function p.lookup(frame)
	local func = get_func(frame.args, 1, "lookup_")
	local codepoint = get_codepoint(frame.args, 2)
	local result = func(codepoint)
	if func == p.lookup_name then
		-- Prevent code point labels such as <control-0000> from being
		-- interpreted as HTML tags.
		result = result:gsub("<", "&lt;")
	end
	return result
end

-- This function allows any of the "is" functions to be invoked. The first
-- parameter is the word after "is_"; the second parameter is a string or a
-- code point in hexadecimal base, depending on the function.
function p.is(frame)
	local func = get_func(frame.args, 1, "is_")

	-- is_Latin, is_valid_pagename and is_rtl take strings.
	if func == p.is_Latin or func == p.is_valid_pagename or func == p.is_rtl then
		return (func(frame.args[2]))
	else -- The rest take code points.
		local codepoint = get_codepoint(frame.args, 2)
		return (func(codepoint)) -- Adjust to one result.
	end
end

-- Returns the entry for the code point in the matching "Unihan/kCantonese"
-- data module, or nothing if that module could not be loaded.
function p.lookup_kCantonese(codepoint)
	local data = loader[('Unihan/kCantonese/%02X'):format(floor(codepoint / 0x1000))]
	if data then
		return data[codepoint]
	end
end

return p

	0	1	2	3	4	6	8	A	B	C	D	E	F
00x	U+0000– U+0FFF	U+1000– U+1FFF	U+2000– U+2FFF	U+3000– U+3FFF	U+4000– U+4FFF			U+A000– U+AFFF			U+D000– U+DFFF		U+F000– U+FFFF
01x	U+10000– U+10FFF	U+11000– U+11FFF	U+12000– U+12FFF	U+13000– U+13FFF	U+14000– U+14FFF	U+16000– U+16FFF	U+18000– U+18FFF	U+1A000– U+1AFFF	U+1B000– U+1BFFF	U+1C000– U+1CFFF	U+1D000– U+1DFFF	U+1E000– U+1EFFF	U+1F000– U+1FFFF
0Ex	U+E0000– U+E0FFF

Movatterモバイル変換

Module:Unicode data

Usage

Parameters and functions

code point

"lookup" and "is" functions

Functions overview

Data modules

Copyright

Known issues

See also