This repository was archived on 2023-06-17. You can view files and clone it, but you cannot make any changes to its state, such as pushing and creating new issues, pull requests or comments.
jaPitchPlotter/punctCode.py
Luke 90368c778a Inital commit
This commit contains some basic examples, and the core tool.
2017-08-20 17:33:19 -07:00

248 lines
11 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
import svgwrite as svgw
# If you have pip available, install the SVG dependency with
# pip3 install svgwrite
####
# This script is a "simple" tool for generating pitch accent pattern graphs as shown in Dogen's
# Patreon series (no affiliation).
class punctCode:
    """A word split into mora, paired with one tone code per mora, drawable as an SVG graph.

    Each tone code is a small bit field, read by toSVG as follows:
        bit 0 (0b001)  set -> the point sits on the high line, clear -> the low line
        bit 1 (0b010)  set -> the circle is drawn empty, clear -> filled
        bit 2 (0b100)  set -> no connecting line is drawn back to the previous point
    """
    # Unicode Hiragana Range 3041 -> 309F
    # Small Hiragana 30:41,43,45,47,49,83,85,87,8E
    # Small Ka/Ke 95,96
    # Small Tsu 63
    # Unused Hiragana (we still allow) 30:90,91
    # Unicode Katakana Range (Hiragana + 96d [0x60])
    MODIFY_SET = 'ゃゅょぁぃぅぇぉ'
    HIRIGANA_MIN = 0x3041
    HIRIGANA_MAX = 0x309F
    KATAKANA_OFFSET = 0x60
    KATAKANA_MIN = HIRIGANA_MIN + KATAKANA_OFFSET
    KATAKANA_MAX = HIRIGANA_MAX + KATAKANA_OFFSET
    # TODO: Warn users if they give me Kanji and not kana!

    def __init__(self, key, mora, tones):
        """Store the key, the list of mora strings and the list of integer tone codes."""
        self.key = key
        self.mora = mora
        self.tones = tones

    def __str__(self):
        """Return '<m1,m2,...:t1,t2,...>' listing the mora and then the tone codes."""
        mora_part = ','.join(str(m) for m in self.mora)
        tone_part = ','.join('%d' % t for t in self.tones)
        return '<' + mora_part + ':' + tone_part + '>'

    def __repr__(self):
        return self.__str__()

    def isSafe(self):
        """Report whether there is exactly one tone code per mora."""
        return len(self.mora) == len(self.tones)

    @staticmethod
    def warn(s, ind=0):
        """Print a warning line for message s (ind is accepted but unused).

        Declared static so it behaves the same whether it is called on the class
        or on an instance.
        """
        print('=> WARNING: %s' % s)

    @staticmethod
    def _connector_endpoints(cur, prev, rad):
        """Return (start, end) for the line joining cur back to prev, trimmed to the circle edges.

        Level segments are shortened by the radius at both ends. Segments that change
        height are shortened by rad/sqrt(2) on both axes, which matches the circle edge
        when the segment runs at 45 degrees.
        """
        if cur[1] == prev[1]:
            return (cur[0] - rad, cur[1]), (prev[0] + rad, prev[1])
        inset = rad * (2 ** -0.5)
        # SVG y grows downwards: a larger previous y means the line rises to the current point.
        sign = 1 if prev[1] > cur[1] else -1
        start = (cur[0] - inset, cur[1] + sign * inset)
        end = (prev[0] + inset, prev[1] - sign * inset)
        return start, end

    def toSVG(self, outputFN,
              dx=50, dy=None, rad=7, stroke=3, circ_stroke=None,
              color='#000000', padding=25, padding_tb=None, padding_lr=None,
              offset=20,
              font_family='Noto Sans', font_height=27, style=''):
        """Draw the tone graph with the mora text underneath and save it to outputFN.

        outputFN     file name handed to svgwrite for saving
        dx           distance between the low and the high line
        dy           distance between neighbouring points (defaults to dx)
        rad          circle radius
        stroke       stroke width of the connecting lines
        circ_stroke  stroke width of the circles (defaults to stroke)
        color        stroke, fill and text color
        padding      margin around the drawing; padding_tb / padding_lr override it
                     for top/bottom and left/right respectively
        offset       gap between the low line and the text
        font_family  font family of the mora text
        font_height  font size of the mora text
        style        extra CSS appended to each text element's style attribute

        NOTE(review): despite the names, dx feeds the vertical step and dy the
        horizontal one. This is kept as-is so existing callers render unchanged.
        """
        # These variables control the core geometry of the image. You shouldn't ever need to
        # change this code to configure your image; use the optional named arguments instead.
        step_height = dx
        step_width = step_height if dy is None else dy
        circle_stroke = stroke if circ_stroke is None else circ_stroke
        pad_tb = padding if padding_tb is None else padding_tb
        pad_lr = padding if padding_lr is None else padding_lr

        # Normally tones and mora have the same length; taking the larger of the two keeps a
        # mismatched result fully visible. Never let the span go negative for empty input.
        point_count = max(len(self.tones), len(self.mora))
        width = 2 * pad_lr + step_width * max(point_count - 1, 0)
        height = 2 * pad_tb + step_height + offset + font_height

        dwg = svgw.Drawing(filename=outputFN, size=(width, height))
        # Everything lives in one group that is shifted off the origin, so the points below
        # can use 0 for the low line and -step_height for the high line.
        master = dwg.add(dwg.g(id='master_group'))
        master.translate(pad_lr, pad_tb + step_height)

        # Graph points and connecting lines share one group.
        graph = master.add(dwg.g(id='graph'))
        prev_loc = None
        for index, tone in enumerate(self.tones):
            # One of two set heights, and a step to the right for each point.
            loc = (index * step_width, -(tone % 2) * step_height)
            # Bit 1 marks an empty circle (used for the implied particle slot).
            fill = color if (tone & 0b10) == 0 else 'none'
            graph.add(dwg.circle(center=loc, r=rad,
                                 stroke=color, stroke_width=circle_stroke, fill=fill))
            # Bit 2 suppresses the connecting line. The first point has nothing to connect
            # to, so it is skipped even when the caller did not set that bit.
            if prev_loc is not None and not (tone & 0b100):
                start, end = self._connector_endpoints(loc, prev_loc, rad)
                graph.add(dwg.line(start=start, end=end,
                                   stroke_width=stroke, stroke=color))
            prev_loc = loc

        # Now draw each mora (note: きゅ is one not two, while けっ is two not one).
        # Setting the font size and a middle text anchor on the group centers each label
        # under its circle.
        moras = master.add(dwg.g(id='moras',
                                 font_size=font_height,
                                 text_anchor='middle',
                                 fill=color,
                                 font_family=font_family))
        for index, mora in enumerate(self.mora):
            # NOTE(review): every text element reuses the group's id 'moras'; kept for
            # output compatibility.
            text = moras.add(dwg.text(mora,
                                      insert=(index * step_width, offset + font_height),
                                      id='moras'))
            text.attribs['style'] = ('text-align:center;' + style)

        # TODO: Add error handling around saving the file.
        dwg.save()
# This was supposed to be a class method, but my knowledge of python isn't that strong. It worked
# until I moved the class to its own file. So now it's just a function in the same file. I think?
# TODO: Make sure I'm implementing a class method properly.
###
# The way this method works is it takes in a key (whatever you want) as a file name, a "word"
# meaning specifically the phonetic characters to describe the word (katakana or hiragana), and
# a code representing the pitch accent pattern and line connection preferences for the word.
#
# The code takes the form of comma separated groups of downstep location followed by the mora length
# of the word. For example 勉強=べんきょう would be coded as 0/4. 案内=あんない would be 3/4. A phrase
# like 涙を拭く=なみだをふく would be 1/3,0/2 Note that the を is implicitly contained in the prior
# word. If you wanted to code 涙拭く without the を, you would need to mark the final symbol as being
# dropped by using a star. So the code for なみだふく is 1/3*,0/2. If you want to include
# the final mora following the word even without providing a symbol, set the includeFinalOpen
# named argument in this function to True. This allows for odaka words to include a drop without
# a specific kana being inserted.
#
# Finally, if you wanted to drop the connecting line for some reason, then put a star prior to the
# downstep location. For example, I pulled the thumbnail from Dogen's 25th video with a split
# between べんきょう and して. To code this split use 0/4*,*0/2. This codes two heiban words, the
# first without a spot for the implied particle (as indicated by a trailing star), and a second word
# lacking a connection to the prior word (as indicated by a leading star).
def _parseCodeEntry(codeEntry):
    """Parse one '[*]downstep/length[*]' group into (downStep, wordLength, leadingStar, includeParticle)."""
    parts = codeEntry.strip().split('/')
    if len(parts) < 2:
        raise ValueError('Malformed pitch code entry %r (expected [*]downstep/length[*])'
                         % codeEntry)
    head = parts[0].strip()
    tail = parts[1].strip()
    # A leading star asks for no connecting line back to the previous word.
    leadingStar = head.startswith('*')
    if leadingStar:
        head = head[1:]
    # A trailing star drops the particle slot that normally follows the word.
    includeParticle = not tail.endswith('*')
    if not includeParticle:
        tail = tail[:-1]
    try:
        return int(head), int(tail), leadingStar, includeParticle
    except ValueError:
        raise ValueError('Malformed pitch code entry %r (expected [*]downstep/length[*])'
                         % codeEntry) from None


def parseToneString(name, word, codes, includeFinalOpen=False):
    """Build a punctCode from a kana word and a pitch accent code string.

    name              key stored on the resulting punctCode (also used in warnings)
    word              the kana spelling; small ゃゅょぁぃぅぇぉ (and their katakana
                      forms) are merged into the preceding character, small っ is not
    codes             comma separated groups of 'downstep/length'; a star before the
                      downstep skips the connecting line to the previous word, a star
                      after the length drops the particle slot following the word
    includeFinalOpen  when the code has one more point than the word has mora, add an
                      empty mora for it instead of dropping a trailing particle point

    Returns the punctCode. A warning is printed when the mora and tone counts still
    differ. Raises ValueError when a code group is not of the form described above.
    """
    # First break the word up by characters, lumping small kana with the previous character.
    # Katakana are shifted down into the hiragana range so one set covers both scripts.
    smallKana = {ord(ch) for ch in punctCode.MODIFY_SET}
    moraList = []
    for letter in word:
        codepoint = ord(letter)
        if punctCode.KATAKANA_MIN <= codepoint <= punctCode.KATAKANA_MAX:
            codepoint -= punctCode.KATAKANA_OFFSET
        # TODO: Check for Kanji, punctuation, etc, and handle gracefully
        if codepoint in smallKana and moraList:
            moraList[-1] += letter
        else:
            # A "primary" character, or a small kana with nothing before it to attach to.
            moraList.append(letter)

    ### At this point moraList contains the word split by mora groups.
    ### Now assign high/low and filled data.
    # Each group emits one point per mora plus, unless starred out, one extra point for
    # the particle assumed to follow the word.
    # TODO: Provide a way to omit downstep in compound entries
    # The information is re-encoded as a 2-bit state:
    #   00 lo filled
    #   01 hi filled
    #   10 lo empty
    #   11 hi empty
    # with a leading 3rd bit marking a skipped connecting line.
    n_code = []
    for iCodeEntry, codeEntry in enumerate(codes.split(',')):
        downStep, wordLength, leadingStar, includeParticle = _parseCodeEntry(codeEntry)
        # The very first point never has a predecessor to connect to.
        dontDrawLine = 0b100 if (leadingStar or iCodeEntry == 0) else 0
        if downStep == 0:
            # No downstep: low first mora, everything after it (particle included) stays high.
            n_code.append(0b00 + dontDrawLine)
            n_code.extend([0b01] * (wordLength - 1))
            if includeParticle:
                n_code.append(0b11)
        else:
            # Downstep after mora number downStep: the first mora is high only when the
            # drop comes right after it; everything past the drop is low.
            n_code.append((0b01 if downStep == 1 else 0b00) + dontDrawLine)
            n_code.extend([0b01] * (downStep - 1))
            n_code.extend([0b00] * (wordLength - downStep))
            if includeParticle:
                n_code.append(0b10)

    # We can get a mismatch if the code ends on a particle point for which no kana was
    # given. By default that point is dropped; with includeFinalOpen an empty mora is
    # added for it instead. Any other mismatch is left alone and reported below.
    if len(moraList) == len(n_code) - 1:
        if includeFinalOpen:
            moraList.append('')
        elif (n_code[-1] & 0b10) != 0:
            n_code.pop()

    pc = punctCode(name, moraList, n_code)
    if not pc.isSafe():
        punctCode.warn('Count mismatch for \'%s\' (mora: %d, code: %d)' %
                       (name, len(pc.mora), len(pc.tones)))
    return pc