形態素解析とは、文章を単語などの最小の意味単位に分解し、それぞれの品詞などを分析する技術だ。Pythonを使えば、手軽に形態素解析が可能になる。形態素解析ができると、さまざまな自然言語処理が行える。ここでは、形態素解析を利用して簡単な文書校正ツールを作成しよう。
形態素解析とは？
形態素解析を行うと文章を最小の意味単位である「形態素」に分割できる。例えば、「この猫の名前はタマです」という文章は、「この / 猫 / の / 名前 / は / タマ / です」のように分割される。それぞれの形態素に対して、「名詞」や「動詞」といった品詞の解析も行われる。
具体的には次のようになる。形態素ごとに分割されるだけでなく、文章中で語がどのような役割を果たしているのかまで把握できる。
Pythonでどうやって形態素解析をするの？
Pythonの「janome」というパッケージを使えば、手軽に形態素解析を実行できる。janomeをインストールするには、ターミナル（WindowsはPowerShell、macOSはターミナル.app）を起動し、次のコマンドを実行する。
$ pip install janome
次のようなプログラムを作成することで、形態素解析を実行できる。プログラムは「example.py」という名前で保存しよう。
from janome.tokenizer import Tokenizer
# Sample sentence to analyse (restored from a mis-encoded literal).
text = "この猫の名前はタマです"
# Create the Tokenizer --- (※1)
tokenizer = Tokenizer()
# Run morphological analysis and split the text into tokens --- (※2)
tokens = tokenizer.tokenize(text)
# Print the surface form and part of speech of every token --- (※3)
for token in tokens:
    surface = token.surface
    pos = token.part_of_speech
    print(f"| {surface} | {pos} |")
上記のプログラムを実行するには、下記のようなコマンドを実行する。プログラムを実行すると、「この猫の名前はタマです」という文章を形態素に分割して結果を表示する。
$ python example.py
| この | 連体詞,*,*,* |
| 猫 | 名詞,一般,*,* |
| の | 助詞,連体化,*,* |
| 名前 | 名詞,一般,*,* |
| は | 助詞,係助詞,*,* |
| タマ | 名詞,固有名詞,人名,名 |
| です | 助動詞,*,*,* |
プログラムの各部分を確認してみよう。(※1)では、janomeのTokenizerオブジェクトを生成する。(※2)では、tokenizeメソッドで形態素への分割と語の解析を行う。(※3)では、for ... in ... 文を使って、分割された形態素を画面に出力する。
文章の校正プログラムを作ってみよう
形態素解析を利用して、簡単な文章の校正ツールを作ってみよう。本格的なものを作ると、あっというまに長くなってしまうため、接続詞の繰り返しチェックと、一文の長さをチェックする機能だけを作って90行ちょっとのプログラムを作ってみた。
一気に掲載するには長いため、プログラム全体はこちらのGist( https://gist.github.com/kujirahand/cb811206070506fc4cecee6ff4a62cd3 )にアップしたので後から全体を確認してみよう。
最初に、文章の問題を発見する関数check_textの定義を見てみよう。
import sys
from janome.tokenizer import Tokenizer
# Create the Tokenizer instance used for morphological analysis --- (※1)
tokenizer = Tokenizer()
# Set of conjunction surface forms; check_text adds to it --- (※2)
setuzokusi = set()
def check_text(text):
    """Check *text* for overly long sentences and repeated conjunctions.

    The text is tokenized with the module-level ``tokenizer``; every token
    whose part of speech starts with "接続詞" (conjunction) is added to the
    module-level set ``setuzokusi``.  The token surfaces are then passed to
    ``check_length`` and ``check_repeat``.  All findings are printed (the
    messages are in Japanese) and also returned.

    Returns:
        list[str]: error messages, each starting with "<line number>:",
        sorted by line number.
    """
    errors = []
    # Normalize every line-ending style to "\n" --- (※3)
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # Replace each line break with " ¶ " so line numbers can be counted
    # from the token stream.
    text = text.replace("\n", " ¶ ")
    text += " ¶ "
    # Split the text into tokens by morphological analysis --- (※4)
    tokens = []
    for t in tokenizer.tokenize(text):
        pos = t.part_of_speech
        # Remember conjunctions --- (※5)
        if pos.startswith("接続詞"):
            setuzokusi.add(t.surface)
        tokens.append(t.surface)
    # Check the length of each sentence --- (※6)
    errors += check_length(tokens)
    # Detect repeated conjunctions --- (※7)
    print("[INFO] 接続詞の繰り返しをチェックしています...", setuzokusi)
    # Iterate in sorted order so that errors reported for the same line
    # come out in a reproducible order (set iteration order is not stable
    # across runs).
    for word in sorted(setuzokusi):
        errors += check_repeat(tokens, word, 8)
    # Every message starts with "<line number>:"; sort by that number.
    errors.sort(key=lambda x: int(x.split(":")[0]))
    if not errors:
        print("[OK] 特に問題は見つかりませんでした。")
    else:
        for error in errors:
            print(f"[ERROR] {error}")
    return errors
上記のプログラムを確認しよう。(※1)では、形態素解析を行うために、janomeのTokenizerのインスタンスを作成する。(※2)では、接続詞の一覧を覚えておくためのset型を初期化する。(※3)では、改行コードを統一した後、エラーレポートのために改行コードを便宜的に記号「¶」に置き換えている。
(※4)では形態素解析を行って、文章を形態素に分割する。そして、文章全体を確認して(※5)で接続詞を検出し、変数setuzokusiに追加する。
(※6)では、一文の長さをチェックするために、この後定義する関数check_lengthを呼び出す。そして、(※7)では、関数check_repeatを呼び出して、接続詞の繰り返しがないかをチェックする。
長い一行を検出してエラーにする関数を作ろう
それでは、次に一行の長さをチェックする関数check_lengthを確認してみよう。この関数では句点までの文字数を調べて、100文字以上だった場合に、エラーを返すという仕組みにしている。
def _too_long_error(line_no: int, sentence: str, max_length: int):
    """Return the error message for *sentence* if it is longer than
    *max_length* characters, otherwise ``None``."""
    s_len = len(sentence)
    if s_len <= max_length:
        return None
    # The second line shows the first 30 characters as a preview.
    return (f"{line_no}: 一文が長すぎます({s_len}文字)\n"
            f" - {sentence[:30]}…")


def check_length(tokens: list[str], max_length: int = 100):
    """Report every sentence longer than *max_length* characters.

    A sentence ends at a "。" token or at a "¶" token (the line-break
    marker); the delimiter itself is not counted.  Each "¶" also advances
    the line counter, which starts at 1.  Text left over after the last
    delimiter is checked as well, so an unterminated final sentence is not
    silently skipped.

    Args:
        tokens: token surface strings in document order.
        max_length: longest accepted sentence length in characters.

    Returns:
        list[str]: one message per offending sentence, each starting with
        "<line number>:".
    """
    errors = []
    parts = []  # tokens of the sentence currently being accumulated
    line_no = 1
    for t in tokens:
        if t not in ("¶", "。"):
            parts.append(t)
            continue
        # Reached a sentence boundary: evaluate what was accumulated.
        message = _too_long_error(line_no, "".join(parts), max_length)
        if message is not None:
            errors.append(message)
        parts = []
        if t == "¶":
            line_no += 1
    # Check the trailing sentence when the input does not end in a delimiter.
    message = _too_long_error(line_no, "".join(parts), max_length)
    if message is not None:
        errors.append(message)
    return errors
ここでは、一文の長さを調べるために、文章の最初から末尾まで、改行か句点があるか探すという処理にしている。文の区切りを見つけたら、カウンタ用の変数sをリセットするという処理になっている。そして、一文が100字以上の場合にエラーを出している。
連続する接続詞を検出する関数を作ろう
続いて、連続で出現する接続詞を調べる関数check_repeatを確認しよう。この関数では、8行以内に同じ接続詞が登場したらエラーを返すという仕組みにした。
def check_repeat(tokens: list[str], word: str, limit: int = 8):
    """Report occurrences of *word* that appear close to the previous one.

    Lines are counted from 1 and advance at every "¶" token.  Each
    occurrence of *word* is compared with the previous occurrence; when the
    two are at most *limit* lines apart (the same line counts), an error is
    recorded.  The message quotes up to 10 tokens starting at each of the
    two occurrences.

    Args:
        tokens: token surface strings in document order.
        word: the token to look for (a conjunction, in check_text's use).
        limit: largest line distance that still counts as a repetition.

    Returns:
        list[str]: one message per repeated occurrence, each starting with
        "<line number>:".
    """
    errors = []
    last_line = None  # line of the previous occurrence; None until one is seen
    last_near = ""    # text snippet starting at the previous occurrence
    line_no = 1
    for i, t in enumerate(tokens):
        if t == "¶":
            line_no += 1
            continue
        if t != word:
            continue
        near = "".join(tokens[i:i + 10])
        if last_line is not None and line_no - last_line <= limit:
            errors.append(
                f"{line_no}:「{word}」が連続しています\n"
                f" -{last_line:4}行目: {last_near}…\n"
                f" -{line_no:4}行目: {near}…")
        # Always compare the next occurrence against this one.
        last_line = line_no
        last_near = near
    return errors
形態素解析で分割した単語(tokens)を、上から順に調べていって該当する接続詞が出て来たら、その位置をメモっておく。そして、その後limit行以内に同じ接続詞が見つかったら、エラーを表示するという処理になっている。
続く部分では、ファイルを読み出して、チェックするという処理になっている。
def check_file(file_path):
    """Read *file_path* as UTF-8 text and check its contents.

    Returns whatever ``check_text`` returns for the file's text.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    # Run the text checks on the whole file content.
    return check_text(text)
if __name__ == "__main__":
    if len(sys.argv) == 2:
        # A file name was given as the single command-line argument.
        check_file(sys.argv[1])
    else:
        # Otherwise print the usage line.
        print("使い方: python proofreading.py [ファイル名]")
プログラムを実行してみよう
ここまで紹介したプログラムを「proofreading.py」という名前で保存しよう。ターミナルで次のようなコマンドを実行することで、テキストファイルの問題を洗い出すことができる。例えば以下の実行例では「test.txt」という文章をチェックする。
python proofreading.py test.txt
本連載の第126回の原稿にわざと問題を追加して「test.txt」に保存してから試してみた。正しく問題を報告できた。
GUIツールを作って利便性アップ
なお、コマンドラインからだと使いづらかったので、適当なGUIの画面も作成した。そのプログラムもこちら( https://gist.github.com/kujirahand/cb811206070506fc4cecee6ff4a62cd3?permalinkcommentid=5591601#gistcomment-5591601 )にアップした。テキストボックスに文章をコピーしてチェックができる。
このツールを実行するために、ターミナルで「pip install TkEasyGUI」を実行して、GUIライブラリのTkEasyGUIをインストールしよう。そして、上記のプログラムと同じフォルダに配置してIDLEなどのツールでプログラムを読み出して実行すれば良い。
まとめ
以上、今回は形態素解析を行って、簡単な文書校正ツールを作ってみた。今回紹介したプログラムでは、簡単なチェック機能しかないが、自分で確認したい項目をどんどん追加することで、うっかりミスをチェックする賢いツールに育てていくことができる。こうしたツールのプログラミングは実用的で楽しいものなのだ。改良して自分だけの校正ツールを作ってみると良いだろう。
自由型プログラマー。くじらはんどにて、プログラミングの楽しさを伝える活動をしている。代表作に、日本語プログラミング言語「なでしこ」 、テキスト音楽「サクラ」など。2001年オンラインソフト大賞入賞、2004年度未踏ユース スーパークリエータ認定、2010年 OSS貢献者章受賞。これまで50冊以上の技術書を執筆した。直近では、「大規模言語モデルを使いこなすためのプロンプトエンジニアリングの教科書(マイナビ出版)」「Pythonでつくるデスクトップアプリ(ソシム)」「実践力を身につける Pythonの教科書 第2版」「シゴトがはかどる Python自動処理の教科書(マイナビ出版)」など。



