I'm trying to generate all Unicode 16.0 characters into one file and all Unicode 15.1 characters into another file, and then write the characters that were added in Unicode 16.0 to a new file.
I tried this code, but it is not what I'm looking for, because there may be new emoji or other characters that are not printable under Unicode 15.1 but are printable under Unicode 16.0, and I don't think I've generated the characters correctly. Please take a look at the source code — thank you.
import os
# Output file names.
file_15_1 = "unicode_15_1.txt"
file_16_0 = "unicode_16_0.txt"
file_new_in_16_0 = "new_in_16_0.txt"
# NOTE(review): these look like the *number of assigned characters* per Unicode
# version, not the highest assigned code point (code points go up to U+10FFFF
# with large unassigned gaps) — confirm intent before iterating 0..end.
unicode_15_1_end = 149813
unicode_16_0_end = 154998
def is_visible(char):
    """Return True when *char* is a non-empty, printable, non-whitespace string."""
    if not char:
        return False
    return char.isprintable() and not char.isspace()
def generate_unicode_file(start, end, filename):
    """Write one character per line for every code point in [start, end].

    Surrogate code points (U+D800..U+DFFF) cannot be encoded in UTF-8 on
    their own, so they are skipped explicitly.  The original relied on a
    broad ``except ValueError`` around the write to drop them, which would
    also hide unrelated encoding problems.
    """
    with open(filename, "w", encoding="utf-8") as f:
        for codepoint in range(start, end + 1):
            # Lone surrogates raise UnicodeEncodeError on write; skip up front.
            if 0xD800 <= codepoint <= 0xDFFF:
                continue
            f.write(chr(codepoint) + "\n")
# Generate the candidate character lists for both versions.
# NOTE(review): this treats the end constants as the highest code point to
# emit, which only matches real Unicode coverage if they are chosen that way.
generate_unicode_file(0, unicode_15_1_end, file_15_1)
generate_unicode_file(0, unicode_16_0_end, file_16_0)
def find_new_characters(file1, file2, output_file):
    """Write to *output_file* the visible characters in *file2* but not *file1*.

    Both inputs hold one character per line.  Lines are split on "\n" only:
    the original used ``str.splitlines()``, which also breaks on characters
    such as U+000B, U+000C, U+0085, U+2028 and U+2029, so those characters
    were silently corrupted when they appeared as entries themselves.
    """
    with open(file1, "r", encoding="utf-8") as f1, open(file2, "r", encoding="utf-8") as f2:
        chars_old = set(f1.read().split("\n"))
        chars_new = set(f2.read().split("\n"))
    # Characters present in the second file only.
    new_chars = chars_new - chars_old
    with open(output_file, "w", encoding="utf-8") as f_out:
        for char in sorted(new_chars):
            # Keep only printable, non-whitespace, non-empty entries.
            if is_visible(char):
                f_out.write(char + "\n")
# Compute the 16.0-only characters and report the files that were written.
find_new_characters(file_15_1, file_16_0, file_new_in_16_0)
print(f"- {file_15_1}")
print(f"- {file_16_0}")
print(f"- {file_new_in_16_0}")
I'm trying to generate all Unicode 16.0 characters into one file and all Unicode 15.1 characters into another file, and then write the characters that were added in Unicode 16.0 to a new file.
I tried this code, but it is not what I'm looking for, because there may be new emoji or other characters that are not printable under Unicode 15.1 but are printable under Unicode 16.0, and I don't think I've generated the characters correctly. Please take a look at the source code — thank you.
import os
# Output file names.
file_15_1 = "unicode_15_1.txt"
file_16_0 = "unicode_16_0.txt"
file_new_in_16_0 = "new_in_16_0.txt"
# NOTE(review): these look like the *number of assigned characters* per Unicode
# version, not the highest assigned code point (code points go up to U+10FFFF
# with large unassigned gaps) — confirm intent before iterating 0..end.
unicode_15_1_end = 149813
unicode_16_0_end = 154998
def is_visible(char):
    """Return True when *char* is a non-empty, printable, non-whitespace string."""
    if not char:
        return False
    return char.isprintable() and not char.isspace()
def generate_unicode_file(start, end, filename):
    """Write one character per line for every code point in [start, end].

    Surrogate code points (U+D800..U+DFFF) cannot be encoded in UTF-8 on
    their own, so they are skipped explicitly.  The original relied on a
    broad ``except ValueError`` around the write to drop them, which would
    also hide unrelated encoding problems.
    """
    with open(filename, "w", encoding="utf-8") as f:
        for codepoint in range(start, end + 1):
            # Lone surrogates raise UnicodeEncodeError on write; skip up front.
            if 0xD800 <= codepoint <= 0xDFFF:
                continue
            f.write(chr(codepoint) + "\n")
# Generate the candidate character lists for both versions.
# NOTE(review): this treats the end constants as the highest code point to
# emit, which only matches real Unicode coverage if they are chosen that way.
generate_unicode_file(0, unicode_15_1_end, file_15_1)
generate_unicode_file(0, unicode_16_0_end, file_16_0)
def find_new_characters(file1, file2, output_file):
    """Write to *output_file* the visible characters in *file2* but not *file1*.

    Both inputs hold one character per line.  Lines are split on "\n" only:
    the original used ``str.splitlines()``, which also breaks on characters
    such as U+000B, U+000C, U+0085, U+2028 and U+2029, so those characters
    were silently corrupted when they appeared as entries themselves.
    """
    with open(file1, "r", encoding="utf-8") as f1, open(file2, "r", encoding="utf-8") as f2:
        chars_old = set(f1.read().split("\n"))
        chars_new = set(f2.read().split("\n"))
    # Characters present in the second file only.
    new_chars = chars_new - chars_old
    with open(output_file, "w", encoding="utf-8") as f_out:
        for char in sorted(new_chars):
            # Keep only printable, non-whitespace, non-empty entries.
            if is_visible(char):
                f_out.write(char + "\n")
# Compute the 16.0-only characters and report the files that were written.
find_new_characters(file_15_1, file_16_0, file_new_in_16_0)
print(f"- {file_15_1}")
print(f"- {file_16_0}")
print(f"- {file_new_in_16_0}")
Although the current Python 3.13 supports Unicode 15.1.0 in its unicodedata module and can identify the supported code points, that won't help you with Unicode 16.0.0. If you download the UnicodeData.txt files for each version (15.1.0, 16.0.0) you can parse them yourself for the supported characters and write them to a file; although, without a font supporting Unicode 16.0.0 you won't see much. UnicodeData.html describes the data format.
Here's an example that uses the csv module to parse the semicolon-delimited data files.
import csv
def print_file(filename, data):
with open(filename, 'w', encoding='utf-8-sig') as file:
for key, value in data.items():
code = int(key, 16)
if 0xD800 <= code <= 0xDFFF:
continue # ignore surrogates...can't be written individually
name = value[0]
print(f'{chr(code)} U+{key} {name}', file=file)
# Parse the semicolon-delimited UnicodeData.txt files for both versions.
with open('Downloads/UnicodeData15.1.0.txt', encoding='ascii', newline='') as file:
    reader = csv.reader(file, delimiter=';')
    data15 = list(reader)
with open('Downloads/UnicodeData16.0.0.txt', encoding='ascii', newline='') as file:
    reader = csv.reader(file, delimiter=';')
    data16 = list(reader)

# Field 0 is the hex code point; keep the remaining fields as the value.
dict15 = {row[0]: row[1:] for row in data15}
dict16 = {row[0]: row[1:] for row in data16}
# Code points present in 16.0.0 but not in 15.1.0.
diff = {key: value for key, value in dict16.items() if key not in dict15}

print(f'Code points in Unicode 15.1.0: {len(dict15)}')
print(f'Code points in Unicode 16.0.0: {len(dict16)}')
print(f'New code points in Unicode 16.0.0: {len(diff)}')

# Bug fix: the original passed the undefined name `d15` (NameError) to the
# first two calls and wrote the 15.1 data to the 16.0 output file; the
# dictionaries defined above are dict15 and dict16.
print_file('unicode_15_1.txt', dict15)
print_file('unicode_16_0.txt', dict16)
print_file('new_in_16_0.txt', diff)
Output (along with three files)
Code points in Unicode 15.1.0: 34931
Code points in Unicode 16.0.0: 40116
New code points in Unicode 16.0.0: 5185
Example of new_in_16_0.txt
Glyph display depends on font support. I could see the last five correctly on Windows 11 with Chrome Version 131.0.6778.265 (Official Build) (64-bit):
U+0897 ARABIC PEPET
U+1B4E BALINESE INVERTED CARIK SIKI
U+1B4F BALINESE INVERTED CARIK PAREREN
U+1B7F BALINESE PANTI BAWAK
U+1C89 CYRILLIC CAPITAL LETTER TJE
...
`chr(codepoint)` and `str.isprintable()` are defined for code points up to `unicodedata.unidata_version`, which is 15.1.0 for my Python 3.13.1. However, you could compare the files https://www.unicode.org/Public/15.1.0/ucd/UnicodeData.txt and https://www.unicode.org/Public/16.0.0/ucd/UnicodeData.txt — JosefZ, commented Jan 21 at 17:31