248 lines
11 KiB
Python
248 lines
11 KiB
Python
#!/usr/bin/env python3
|
||
import svgwrite as svgw
|
||
# If you have pip (the Python package installer), install the SVG dependency with
|
||
# pip3 install svgwrite
|
||
|
||
####
|
||
# This script is a "simple" tool for generating pitch accent pattern graphs as shown in Dogen's
|
||
# Patreon series (no affiliation).
|
||
|
||
class punctCode:
    """A pitch-accent pattern (mora plus per-mora tone codes) that can be drawn as SVG.

    Attributes:
        key:   caller-chosen identifier for the entry (stored, not interpreted here).
        mora:  sequence of strings, one per mora, drawn as labels under the graph.
        tones: sequence of ints, one per graph point, using this bit layout:
                 bit 0 (0b001)  1 = high position, 0 = low position
                 bit 1 (0b010)  1 = hollow circle, 0 = filled circle
                 bit 2 (0b100)  1 = do not draw the line back to the previous point
    """

    # Unicode Hiragana Range 3041 -> 309F
    #   Small Hiragana 30:41,43,45,47,49,83,85,87,8E
    #   Small Ka/Ke 95,96
    #   Small Tsu 63
    #   Unused Hiragana (we still allow) 30:90,91
    # Unicode Katakana Range (Hiragana + 96d [0x60])
    #
    # MODIFY_SET holds the small kana that are merged into the preceding
    # character to form a single mora.
    # NOTE(review): the small-kana list above also mentions 0x308E, which is
    # not part of MODIFY_SET - confirm whether that omission is intended.
    MODIFY_SET = 'ゃゅょぁぃぅぇぉ'
    HIRIGANA_MIN = 0x3041
    HIRIGANA_MAX = 0x309F
    KATAKANA_OFFSET = 0x60
    KATAKANA_MIN = HIRIGANA_MIN + KATAKANA_OFFSET
    KATAKANA_MAX = HIRIGANA_MAX + KATAKANA_OFFSET
    # TODO: Warn users if they give me Kanji and not kana!

    def __init__(self, key, mora, tones):
        """Store the key, the mora labels and the tone codes unchanged."""
        self.key = key
        self.mora = mora
        self.tones = tones

    def __str__(self):
        """Return '<m1,m2,...:t1,t2,...>' listing the mora and then the tone codes."""
        mora_part = ','.join('%s' % m for m in self.mora)
        tone_part = ','.join('%d' % t for t in self.tones)
        return '<' + mora_part + ':' + tone_part + '>'

    def __repr__(self):
        """Use the same text as __str__ so containers of entries print readably."""
        return self.__str__()

    def isSafe(self):
        """Report whether there is exactly one tone code per mora."""
        return len(self.mora) == len(self.tones)

    @staticmethod
    def warn(s, ind=0):
        """Print a warning line to stdout.

        Declared static so that both ``punctCode.warn(msg)`` and
        ``instance.warn(msg)`` print ``msg``; without the decorator the
        instance form would bind the instance to ``s``.

        Args:
            s:   the message to print.
            ind: accepted for backward compatibility; currently unused.
        """
        print('=> WARNING: %s' % s)

    @staticmethod
    def _connector(cur, prev, rad):
        """Return (start, end) of the line from `cur` to `prev`, trimmed by the circle radius.

        Points at the same height are trimmed horizontally by `rad`. Points at
        different heights are trimmed by rad/sqrt(2) on both axes, which lands
        on the circle rim when the segment runs at 45 degrees.
        """
        if cur[1] == prev[1]:
            return (cur[0] - rad, cur[1]), (prev[0] + rad, prev[1])
        inset = rad * (2 ** -0.5)
        # SVG y grows downward: a larger y on `prev` means it sits below `cur`.
        toward_prev = 1 if prev[1] > cur[1] else -1
        start = (cur[0] - inset, cur[1] + toward_prev * inset)
        end = (prev[0] + inset, prev[1] - toward_prev * inset)
        return start, end

    def toSVG(self, outputFN,
              dx=50, dy=None, rad=7, stroke=3, circ_stroke=None,
              color='#000000', padding=25, padding_tb=None, padding_lr=None,
              offset=20,
              font_family='Noto Sans', font_height=27, style=''):
        """Render the pattern as an SVG file written to `outputFN`.

        Args:
            outputFN:    output file name handed to svgwrite.
            dx:          vertical distance between the low and high rows.
            dy:          horizontal distance between consecutive points;
                         defaults to `dx` when None.
                         NOTE(review): the names suggest the opposite axes;
                         the mapping is kept as-is for backward compatibility.
            rad:         circle radius.
            stroke:      stroke width of the connecting lines.
            circ_stroke: stroke width of the circles; defaults to `stroke`.
            color:       stroke colour, fill colour of filled circles and text.
            padding:     margin used on all sides unless overridden below.
            padding_tb:  top/bottom margin override.
            padding_lr:  left/right margin override.
            offset:      gap between the graph baseline and the text.
            font_family: font family of the mora labels.
            font_height: font size of the mora labels.
            style:       extra CSS appended to each label's style attribute.

        No error handling is performed around saving the file.
        """
        step_height = dx
        step_width = step_height if dy is None else dy
        line_stroke = stroke
        circle_stroke = stroke if circ_stroke is None else circ_stroke
        pad_tb = padding_tb if padding_tb is not None else padding
        pad_lr = padding_lr if padding_lr is not None else padding

        # SVG image placement to move us off of the origin.
        move_down = pad_tb + step_height
        move_right = pad_lr

        # Size the canvas from whichever list is longer so a mismatched entry
        # is still fully visible; never count fewer than one point so the
        # width cannot drop below the padding.
        point_count = max(len(self.tones), len(self.mora), 1)
        width = 2 * pad_lr + step_width * (point_count - 1)
        height = 2 * pad_tb + step_height + offset + font_height

        dwg = svgw.Drawing(filename=outputFN, size=(width, height))
        # Everything lives in one group whose translation fixes the origin.
        master = dwg.add(dwg.g(id='master_group'))
        master.translate(move_right, move_down)
        # Graph points and connecting lines share their own group.
        graph = master.add(dwg.g(id='graph'))

        prev_loc = None  # no previous point exists before the first tone
        for index, tone in enumerate(self.tones):
            # One of two set heights, and one step to the right per point.
            loc = (index * step_width, -(tone % 2) * step_height)
            # Bit 1 selects a hollow circle.
            fill = color if (tone & 0b10) == 0 else 'none'
            graph.add(dwg.circle(center=loc, r=rad,
                                 stroke=color, stroke_width=circle_stroke,
                                 fill=fill))

            # Bit 2 suppresses the connecting line; the first point has
            # nothing to connect to regardless of its flags.
            if prev_loc is not None and not (tone & 0b100):
                start, end = self._connector(loc, prev_loc, rad)
                graph.add(dwg.line(start=start, end=end,
                                   stroke_width=line_stroke, stroke=color))
            prev_loc = loc

        # Now draw each mora (note: きゅ is one not two, while けっ is two not
        # one). Font size and a middle text anchor on the group centre each
        # label under its circle.
        moras = master.add(dwg.g(id='moras',
                                 font_size=font_height,
                                 text_anchor='middle',
                                 fill=color,
                                 font_family=font_family))
        for index, mora in enumerate(self.mora):
            # Each label gets its own id; XML ids must be unique in a document.
            text = moras.add(dwg.text(mora,
                                      insert=(index * step_width, offset + font_height),
                                      id='mora_%d' % index))
            text.attribs['style'] = 'text-align:center;' + style

        # TODO: Add error handling around the save.
        dwg.save()
|
||
|
||
# This was supposed to be a class method, but my knowledge of python isn't that strong. It worked
|
||
# until I moved the class to its own file. So now it's just a function in the same file. I think?
|
||
# TODO: Make sure I'm implementing a class method properly.
|
||
|
||
###
|
||
# The way this method works is it takes in a key (whatever you want) as a file name, a "word"
|
||
# meaning specifically the phonetic characters to describe the word (katakana or hiragana), and
|
||
# a code representing the pitch accent pattern and line connection preferences for the word.
|
||
#
|
||
# The code takes the form of comma separated groups of downstep location followed by the mora length
|
||
# of the word. For example 勉強=べんきょう would be coded as 0/4. 案内=あんない would be 3/4. A phrase
|
||
# like 涙を拭く=なみだをふく would be 1/3,0/2 Note that the を is implicitly contained in the prior
|
||
# word. If you wanted to code 涙拭く without the を, you would need to mark the final symbol as being
|
||
# dropped by using a star. So the code for なみだふく is 1/3*,0/2. If you want to include
|
||
# the final mora following the word even without providing a symbol, set the includeFinalOpen
|
||
# named argument in this function to True. This allows for odaka words to include a drop without
|
||
# a specific kana being inserted.
|
||
#
|
||
# Finally, if you wanted to drop the connecting line for some reason, then put a star prior to the
|
||
# downstep location. For example, I pulled the thumbnail from Dogen's 25th video with a split
|
||
# between べんきょう and して. To code this split use 0/4*,*0/2. This codes two heiban words, the
|
||
# first without a spot for the implied particle (as indicated by a trailing star), and a second word
|
||
# lacking a connection to the prior word (as indicated by a leading star).
|
||
def parseToneString(name, word, codes, includeFinalOpen=False):
    """Build a punctCode from a kana string and a compact pitch-accent code.

    Args:
        name:  key stored on the resulting punctCode and used in warnings.
        word:  the kana to split into mora. The small kana listed in
               punctCode.MODIFY_SET (hiragana, or their katakana
               counterparts) are merged into the preceding character.
        codes: comma separated entries of the form ``[*]downstep/length[*]``.
               A leading star suppresses the line joining this word to the
               previous one; a trailing star drops the extra "particle"
               point that otherwise follows each word. Whitespace around
               entries and around the slash is ignored, and blank entries
               (for example after a trailing comma) are skipped.
        includeFinalOpen: when the tone list is exactly one longer than the
               mora list, True appends a '○' placeholder mora; False drops
               the last tone if it is a hollow (particle) point.

    Returns:
        The punctCode. A warning is printed if the mora and tone counts
        still differ.

    Raises:
        ValueError: if an entry has no '/', has an empty side, or contains
            a non-integer downstep or length.
    """
    # --- split the word into mora groups -------------------------------
    moraList = []
    for letter in word:
        codepoint = ord(letter)
        # Fold katakana onto the hiragana range so that one small-kana table
        # serves both scripts.
        if punctCode.KATAKANA_MIN <= codepoint <= punctCode.KATAKANA_MAX:
            codepoint -= punctCode.KATAKANA_OFFSET
        isSmall = chr(codepoint) in punctCode.MODIFY_SET

        # TODO: Check for Kanji, punctuation, etc, and handle gracefully

        if isSmall and moraList:
            # Lump the small kana with the previous character (きゅ is one mora).
            moraList[-1] += letter
        else:
            # A full-size kana, a small っ, or a small kana that starts the
            # word (nothing to attach it to) becomes its own mora.
            moraList.append(letter)

    # --- translate the code string into per-point tone codes ------------
    # The code follows the numbering used in the Apple Dictionary and
    # 三省堂スーパー大辞林: the downstep location, a slash, then the word
    # length in mora. Each word yields one point per mora plus, unless
    # starred out, one hollow point for a following particle.
    #
    # Each point is re-encoded as:
    #   0b00 lo filled    0b01 hi filled
    #   0b10 lo empty     0b11 hi empty
    # with 0b100 added to mark "skip the connecting line to the previous point".
    # TODO: Provide a way to omit downstep in compound entries
    n_code = []
    for rawEntry in codes.split(','):
        codeEntry = rawEntry.strip()
        if not codeEntry:
            continue  # tolerate blank entries such as a trailing comma

        # Only the first two slash-separated fields are used.
        parts = [part.strip() for part in codeEntry.split('/')]
        if len(parts) < 2 or not parts[0] or not parts[1]:
            raise ValueError(
                "Malformed tone code %r for '%s': expected [*]downstep/length[*]"
                % (codeEntry, name))

        skipLine = parts[0].startswith('*')
        includeParticle = not parts[1].endswith('*')  # draw the particle unless starred
        downStep = int(parts[0][1:] if skipLine else parts[0])
        wordLength = int(parts[1] if includeParticle else parts[1][:-1])
        # The very first point never has a predecessor to connect to.
        dontDrawLine = 0b100 if (skipLine or not n_code) else 0

        if downStep == 0:
            # No downstep: low start, high for the rest, particle stays high.
            n_code.append(0b00 + dontDrawLine)
            n_code.extend([0b01] * (wordLength - 1))
            if includeParticle:
                n_code.append(0b11)
        else:
            # Downstep after the first mora means the word starts high;
            # otherwise it starts low and rises.
            first = 0b01 if downStep == 1 else 0b00
            n_code.append(first + dontDrawLine)
            n_code.extend([0b01] * (downStep - 1))
            n_code.extend([0b00] * (wordLength - downStep))
            if includeParticle:
                n_code.append(0b10)

    # A one-point surplus usually means the final particle point has no kana.
    # Either add a filler symbol, or drop that point if it really is a hollow
    # particle point. Anything else is left mismatched and reported below.
    if len(moraList) == len(n_code) - 1:
        if includeFinalOpen:
            moraList.append('○')
        elif (n_code[-1] & 0b10) != 0:
            n_code.pop()

    pc = punctCode(name, moraList, n_code)
    if not pc.isSafe():
        punctCode.warn('Count mismatch for \'%s\' (mora: %d, code: %d)' %
                       (name, len(pc.mora), len(pc.tones)))
    return pc
|
||
|