From 90368c778ad0576fb14c0ac2de6cf5f4d62dfeea Mon Sep 17 00:00:00 2001 From: Luke Date: Sun, 20 Aug 2017 17:33:19 -0700 Subject: [PATCH] Inital commit This commit contains some basic examples, and the core tool. --- example_codes.txt | 4 + examples/勉強して.svg | 2 + examples/銀行じゃない.svg | 2 + examples/電車.svg | 2 + examples/電車は.svg | 2 + parse_examples.py | 30 +++++ punctCode.py | 248 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 290 insertions(+) create mode 100644 example_codes.txt create mode 100644 examples/勉強して.svg create mode 100644 examples/銀行じゃない.svg create mode 100644 examples/電車.svg create mode 100644 examples/電車は.svg create mode 100644 parse_examples.py create mode 100644 punctCode.py diff --git a/example_codes.txt b/example_codes.txt new file mode 100644 index 0000000..0aa0bdf --- /dev/null +++ b/example_codes.txt @@ -0,0 +1,4 @@ +勉強して:べんきょうして:0/4*,*0/2 +銀行じゃない:ぎんこうじゃない:0/4,1/2 +電車は:でんしゃは:2/3 +電車:でんしゃ:2/3 diff --git a/examples/勉強して.svg b/examples/勉強して.svg new file mode 100644 index 0000000..1839484 --- /dev/null +++ b/examples/勉強して.svg @@ -0,0 +1,2 @@ + +きょ \ No newline at end of file diff --git a/examples/銀行じゃない.svg b/examples/銀行じゃない.svg new file mode 100644 index 0000000..a60e4f1 --- /dev/null +++ b/examples/銀行じゃない.svg @@ -0,0 +1,2 @@ + +じゃ \ No newline at end of file diff --git a/examples/電車.svg b/examples/電車.svg new file mode 100644 index 0000000..a6f0906 --- /dev/null +++ b/examples/電車.svg @@ -0,0 +1,2 @@ + +しゃ \ No newline at end of file diff --git a/examples/電車は.svg b/examples/電車は.svg new file mode 100644 index 0000000..554bde3 --- /dev/null +++ b/examples/電車は.svg @@ -0,0 +1,2 @@ + +しゃ \ No newline at end of file diff --git a/parse_examples.py b/parse_examples.py new file mode 100644 index 0000000..458ede9 --- /dev/null +++ b/parse_examples.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +import punctCode + +# Given a file name, loop line by line and convert the string of a Key (filename), phonetic word, +# plus the pitch accent code to a big 
####
# This script is a "simple" tool for generating pitch accent pattern graphs as shown in Dogen's
# Patreon series (no affiliation).
#
# Writing SVG files needs the third-party svgwrite package. If you have pip, install it with
#     pip3 install svgwrite


class punctCode:
    """A word split into mora, paired with one graph code per plotted point.

    Attributes:
        key:   Caller-chosen identifier for the entry; stored as given.
        mora:  List of strings, one per mora column of the graph.
        tones: List of ints, one per plotted point. Bit 0 set means the point is
               drawn on the high row, bit 1 set means the circle is left unfilled,
               and bit 2 set means no line is drawn back to the previous point.
    """

    # Unicode Hiragana Range            3041 -> 309F
    # Small Hiragana                    30:41,43,45,47,49,83,85,87,8E
    # Small Ka/Ke                       95,96
    # Small Tsu                         63
    # Unused Hiragana (we still allow)  30:90,91
    # Unicode Katakana Range            (Hiragana + 96d [0x60])
    #
    # Small kana that are merged into the preceding character to form one mora.
    # The small wa (0x308E) is included so the set matches the list above.
    MODIFY_SET = 'ゃゅょぁぃぅぇぉゎ'
    # NOTE: the "HIRIGANA" spelling is kept because these are public attribute names.
    HIRIGANA_MIN = 0x3041
    HIRIGANA_MAX = 0x309F
    KATAKANA_OFFSET = 0x60
    KATAKANA_MIN = HIRIGANA_MIN + KATAKANA_OFFSET
    KATAKANA_MAX = HIRIGANA_MAX + KATAKANA_OFFSET
    # TODO: Warn users if they give me Kanji and not kana!

    def __init__(self, key, mora, tones):
        """Store the key, the mora list and the tone code list unchanged."""
        self.key = key
        self.mora = mora
        self.tones = tones

    def __str__(self):
        """Return '<mora,mora,...:tone,tone,...>' with tones formatted as integers."""
        moraText = ','.join('%s' % m for m in self.mora)
        toneText = ','.join('%d' % t for t in self.tones)
        return '<' + moraText + ':' + toneText + '>'

    def __repr__(self):
        """Same text as __str__."""
        return self.__str__()

    def isSafe(self):
        """Return True when there is exactly one tone code per mora."""
        return len(self.mora) == len(self.tones)

    @staticmethod
    def warn(s, ind=0):
        """Print a warning line for message s.

        Declared static so it works both as punctCode.warn(msg) and on an instance.
        The ind argument is accepted for compatibility and is not used.
        """
        print('=> WARNING: %s' % s)

    @staticmethod
    def _connector(prev, cur, rad):
        """Return (start, end) of the line joining two circles, trimmed by rad at each end.

        prev and cur are the (x, y) centres of the previous and current circle. The
        line starts on the edge of the current circle and ends on the edge of the
        previous one, following the real direction between the centres, so it stays
        correct when the horizontal and vertical steps differ. Returns None when
        both centres coincide (there is no direction to draw along).
        """
        deltaX = prev[0] - cur[0]
        deltaY = prev[1] - cur[1]
        if deltaY == 0:
            if deltaX == 0:
                return None
            # Horizontal segment: plain add/subtract keeps integer coordinates integral.
            sign = -1 if deltaX < 0 else 1
            return ((cur[0] + sign * rad, cur[1]), (prev[0] - sign * rad, prev[1]))
        dist = (deltaX * deltaX + deltaY * deltaY) ** 0.5
        shiftX = rad * deltaX / dist
        shiftY = rad * deltaY / dist
        return ((cur[0] + shiftX, cur[1] + shiftY), (prev[0] - shiftX, prev[1] - shiftY))

    def toSVG(self, outputFN,
              dx=50, dy=None, rad=7, stroke=3, circ_stroke=None,
              color='#000000', padding=25, padding_tb=None, padding_lr=None,
              offset=20,
              font_family='Noto Sans', font_height=27, style=''):
        """Draw the pitch graph with the mora text underneath and save it as an SVG.

        All sizes are in SVG user units (pixels).

        Args:
            outputFN:    File name handed to svgwrite and written by save().
            dx:          Horizontal distance between consecutive points.
            dy:          Vertical distance between the low and the high row;
                         defaults to dx when None.
            rad:         Circle radius.
            stroke:      Stroke width of the connecting lines.
            circ_stroke: Stroke width of the circles; defaults to stroke when None.
            color:       Colour used for strokes, filled circles and text.
            padding:     Margin used on every side unless overridden below.
            padding_tb:  Top/bottom margin override.
            padding_lr:  Left/right margin override.
            offset:      Gap between the low row and the top of the text line.
            font_family: Font family set on the text group.
            font_height: Font size set on the text group.
            style:       Extra CSS appended to each text element's style attribute.

        Errors raised by svgwrite or by the file system while saving are not caught.
        """
        # Imported here so the rest of the class works without svgwrite installed.
        import svgwrite as svgw

        stepWidth = dx
        stepHeight = dx if dy is None else dy
        circStroke = stroke if circ_stroke is None else circ_stroke
        paddingTB = padding if padding_tb is None else padding_tb
        paddingLR = padding if padding_lr is None else padding_lr

        # The two lists normally have the same length; taking the max keeps a
        # mismatched result fully visible. An empty entry gets zero steps rather
        # than a negative width.
        columns = max(len(self.tones), len(self.mora))
        width = 2 * paddingLR + stepWidth * max(columns - 1, 0)
        height = 2 * paddingTB + stepHeight + offset + font_height

        dwg = svgw.Drawing(filename=outputFN, size=(width, height))
        # Everything lives in one group that is shifted so the first low point
        # sits at the group's origin.
        master = dwg.add(dwg.g(id='master_group'))
        master.translate(paddingLR, paddingTB + stepHeight)

        graph = master.add(dwg.g(id='graph'))
        prevLoc = None
        for index, tone in enumerate(self.tones):
            # Bit 0 selects the high row (drawn one step above the origin).
            loc = (index * stepWidth, -(tone % 2) * stepHeight)
            # Bit 1 marks an unfilled circle.
            fill = color if (tone & 0b10) == 0 else 'none'
            graph.add(dwg.circle(center=loc, r=rad,
                                 stroke=color, stroke_width=circStroke, fill=fill))
            # Bit 2 suppresses the line; the very first point has nothing to join.
            if prevLoc is not None and not (tone & 0b100):
                segment = self._connector(prevLoc, loc, rad)
                if segment is not None:
                    graph.add(dwg.line(start=segment[0], end=segment[1],
                                       stroke_width=stroke, stroke=color))
            prevLoc = loc

        # Now draw each mora (note: きゅ is one not two, while けっ is two not one.)
        # Font size and the middle text anchor are set on the group so each text
        # is centred under its circle.
        moras = master.add(dwg.g(id='moras',
                                 font_size=font_height,
                                 text_anchor='middle',
                                 fill=color,
                                 font_family=font_family))
        for index, mora in enumerate(self.mora):
            # Each element gets its own id; ids must be unique within a document.
            text = moras.add(dwg.text(mora,
                                      insert=(index * stepWidth, offset + font_height),
                                      id='mora_%d' % index))
            text.attribs['style'] = 'text-align:center;' + style
        # TODO: Add error handling around the save.
        dwg.save()


# This was supposed to be a class method, but my knowledge of python isn't that strong. It worked
# until I moved the class to its own file. So now it's just a function in the same file. I think?
# TODO: Make sure I'm implementing a class method properly.

###
# The way this method works is it takes in a key (whatever you want) as a file name, a "word"
# meaning specifically the phonetic characters to describe the word (katakana or hiragana), and
# a code representing the pitch accent pattern and line connection preferences for the word.
#
# The code takes the form of comma separated groups of downstep location followed by the mora length
# of the word. For example 勉強=べんきょう would be coded as 0/4. 案内=あんない would be 3/4. A phrase
# like 涙を拭く=なみだをふく would be 1/3,0/2 Note that the を is implicitly contained in the prior
# word. If you wanted to code 涙拭く without the を, you would need to mark the final symbol as being
# dropped by using a star. So the code for なみだふく is 1/3*,0/2. If you want to include
# the final mora following the word even without providing a symbol, set the includeFinalOpen
# named argument in this function to True. This allows for odaka words to include a drop without
# a specific kana being inserted.
#
# Finally, if you wanted to drop the connecting line for some reason, then put a star prior to the
# downstep location. For example, I pulled the thumbnail from Dogen's 25th video with a split
# between べんきょう and して. To code this split use 0/4*,*0/2. This codes two heiban words, the
# first without a spot for the implied particle (as indicated by a trailing star), and a second word
# lacking a connection to the prior word (as indicated by a leading star).
def parseToneString(name, word, codes, includeFinalOpen=False):
    """Build a punctCode from a kana word and its pitch accent code string.

    Args:
        name:  Key stored on the resulting punctCode.
        word:  The word written in hiragana and/or katakana.
        codes: Comma separated entries of the form [*]downstep/length[*].
               "downstep" is the mora after which the pitch drops (0 for none)
               and "length" is the number of mora in that word. A leading star
               drops the line connecting the word to the previous point; a
               trailing star omits the extra point for a following particle.
        includeFinalOpen: When the code yields exactly one point more than there
               are mora, True appends a '○' placeholder mora for it; False
               removes that point if it is an unfilled (particle) point.

    Returns:
        A punctCode holding the mora list and one tone code per point. If the
        counts still differ a warning is printed and the object is returned anyway.

    Raises:
        ValueError: If an entry of codes is not of the documented form.
    """
    moraList = _splitMora(word)
    toneList = _encodeTones(codes)

    # We can get a mismatch if we have a word with a hanging accent point for a particle
    # that doesn't exist. By default that trailing point is truncated; otherwise a filler
    # symbol is inserted for it. If the extra point is NOT an unused particle the graph is
    # left mismatched and the user is warned, since the input strings are then at fault.
    if len(moraList) == len(toneList) - 1:
        if includeFinalOpen:
            moraList.append('○')
        elif toneList[-1] & 0b10:
            toneList.pop()

    pc = punctCode(name, moraList, toneList)
    if not pc.isSafe():
        punctCode.warn('Count mismatch for \'%s\' (mora: %d, code: %d)' %
                       (name, len(pc.mora), len(pc.tones)))
    return pc


def _splitMora(word):
    """Split word into mora, attaching small kana from MODIFY_SET to the previous character."""
    # TODO: Check for Kanji, punctuation, etc, and handle gracefully
    moraList = []
    for letter in word:
        codepoint = ord(letter)
        # Katakana is shifted down onto the hiragana block so one set covers both scripts.
        if punctCode.KATAKANA_MIN <= codepoint <= punctCode.KATAKANA_MAX:
            codepoint -= punctCode.KATAKANA_OFFSET
        # A small kana joins the character before it. The small っ is not in the set and
        # stays a mora of its own, as does a small kana that starts the word.
        if moraList and chr(codepoint) in punctCode.MODIFY_SET:
            moraList[-1] += letter
        else:
            moraList.append(letter)
    return moraList


def _encodeTones(codes):
    """Translate a comma separated downstep/length code string into a list of tone codes."""
    # The numbers follow the pattern notation used in the Apple Dictionary and
    # 三省堂スーパー大辞林: the downstep location, a slash, then the word length. One extra
    # point follows each word for an attached particle unless a trailing star says otherwise.
    # TODO: Provide a way to omit downstep in compound entries
    #
    # Each point is re-encoded as a 2-bit state plus a flag:
    #   00 lo filled
    #   01 hi filled
    #   10 lo empty
    #   11 hi empty
    #   a leading 3rd bit marks a skipped connecting line
    LOW, HIGH, OPEN, NO_LINE = 0b00, 0b01, 0b10, 0b100

    tones = []
    for index, rawEntry in enumerate(codes.split(',')):
        # TODO: Handle special codes for odd corner cases
        downText, _, lengthText = rawEntry.strip().partition('/')

        # The first word never has a previous point to connect to.
        skipLine = downText.startswith('*') or index == 0
        if downText.startswith('*'):
            downText = downText[1:]
        includeParticle = not lengthText.endswith('*')
        if not includeParticle:
            lengthText = lengthText[:-1]

        try:
            downStep = int(downText)
            wordLength = int(lengthText)
        except ValueError:
            raise ValueError('Invalid pitch accent code entry %r in %r; '
                             'expected [*]downstep/length[*]' % (rawEntry, codes)) from None
        if downStep < 0 or wordLength < 1:
            raise ValueError('Invalid pitch accent code entry %r in %r; downstep must be '
                             '>= 0 and length >= 1' % (rawEntry, codes))

        if downStep == 0:
            # No downstep: low start, high for the rest, and the particle stays high.
            wordTones = [LOW] + [HIGH] * (wordLength - 1)
            particleTone = HIGH | OPEN
        else:
            # High up to and including the downstep mora, low afterwards. The first
            # mora is only high when the drop comes right after it.
            firstTone = HIGH if downStep == 1 else LOW
            wordTones = ([firstTone] + [HIGH] * (downStep - 1)
                         + [LOW] * (wordLength - downStep))
            particleTone = LOW | OPEN

        if skipLine:
            wordTones[0] |= NO_LINE
        tones.extend(wordTones)
        if includeParticle:
            tones.append(particleTone)
    return tones