| | |
| | |
| |
|
| | import os |
| | import subprocess |
| | import gradio as gr |
| |
|
| | |
| | os.environ["WANDB_MODE"] = "disabled" |
| |
|
| | |
| | _default_mecab = "/usr/bin/mecab" if os.path.exists("/usr/bin/mecab") else "mecab" |
| | MECAB_BIN = os.getenv("MECAB_BIN", _default_mecab) |
| | os.environ["MECAB_BIN"] = MECAB_BIN |
| |
|
| | |
| | _model = None |
| | _exp_info = None |
| |
|
| |
|
| | def _ensure_model(): |
| | global _model, _exp_info |
| | if _model is None: |
| | from infer import load_model |
| |
|
| | result = load_model() |
| | if result is None: |
| | raise RuntimeError( |
| | "Model could not be loaded. Ensure sample_model/ exists with config.yaml and model.pt." |
| | ) |
| | _model, _exp_info = result |
| |
|
| |
|
| | def _to_mecab_lines(results, optimal_morphemes=None) -> str: |
| | |
| | def mecab_features(m): |
| | pos = m.get("pos", "*") |
| | pos1 = m.get("pos_detail1", "*") |
| | pos2 = m.get("pos_detail2", "*") |
| | ctype = m.get("inflection_type", "*") |
| | cform = m.get("inflection_form", "*") |
| | base = m.get("base_form", m.get("lemma", "*")) or "*" |
| | |
| | reading = m.get("reading", "*") or "*" |
| | return f"{pos},{pos1},{pos2},{ctype},{cform},{base},{reading}" |
| |
|
| | items = ( |
| | optimal_morphemes |
| | if optimal_morphemes |
| | else [ |
| | { |
| | "surface": r.get("surface", ""), |
| | "pos": r.get("pos", "*"), |
| | "pos_detail1": "*", |
| | "pos_detail2": "*", |
| | "inflection_type": "*", |
| | "inflection_form": "*", |
| | "base_form": r.get("surface", ""), |
| | "reading": r.get("reading", "*"), |
| | } |
| | for r in results |
| | ] |
| | ) |
| |
|
| | lines = [f"{m.get('surface','')}\t{mecab_features(m)}" for m in items] |
| | lines.append("EOS") |
| | return "\n".join(lines) |
| |
|
| |
|
| | def mecab_plain(text: str) -> str: |
| | """Run system MeCab and return its raw parsing (surface\tCSV ...\nEOS).""" |
| | try: |
| | from mecari.analyzers.mecab import MeCabAnalyzer |
| |
|
| | analyzer = MeCabAnalyzer() |
| | mecab_bin = os.getenv("MECAB_BIN", analyzer.mecab_bin) |
| | args = [mecab_bin] |
| | if isinstance(analyzer.jumandic_path, str) and os.path.isdir(analyzer.jumandic_path): |
| | args += ["-d", analyzer.jumandic_path] |
| | p = subprocess.run(args, input=text, text=True, capture_output=True) |
| | out = (p.stdout or "") + ("\n" + p.stderr if p.stderr else "") |
| | if p.returncode != 0: |
| | return out.strip() or f"mecab error rc={p.returncode}" |
| | |
| | lines = [] |
| | for line in out.splitlines(): |
| | if not line or line.strip() == "EOS": |
| | lines.append("EOS") |
| | continue |
| | if "\t" in line: |
| | surface, feats = line.split("\t", 1) |
| | parts = [s.strip() for s in feats.split(",")] |
| | trimmed = parts[:6] |
| | while len(trimmed) < 6: |
| | trimmed.append("*") |
| | lines.append(f"{surface}\t{','.join(trimmed)}") |
| | else: |
| | lines.append(line) |
| | |
| | if not lines or lines[-1] != "EOS": |
| | lines.append("EOS") |
| | return "\n".join(lines) |
| | except FileNotFoundError: |
| | return "MeCabバイナリが見つかりません(MECAB_BINやpackages.txtを確認)。" |
| | except Exception as e: |
| | return f"mecab実行時エラー: {e}" |
| |
|
| |
|
| | def analyze(text: str): |
| | if not text or not text.strip(): |
| | return "", "" |
| |
|
| | try: |
| | _ensure_model() |
| | from infer import predict_morphemes_from_text |
| |
|
| | text = text.strip() |
| | result = predict_morphemes_from_text(text, _model, _exp_info, silent=True) |
| | if not result: |
| | return "推論に失敗しました。", mecab_plain(text) |
| | results, optimal_morphemes = result |
| | mecari_out = _to_mecab_lines(results, optimal_morphemes) |
| | mecab_out = mecab_plain(text) |
| | return mecari_out, mecab_out |
| | except FileNotFoundError: |
| | return ( |
| | "MeCabが見つかりません。Spaceのpackages.txtに 'mecab' と 'mecab-jumandic-utf8' を含めてビルドし直すか、\n" |
| | "変数 MECAB_BIN=/usr/bin/mecab を設定してください。" |
| | ), "" |
| | except Exception as e: |
| | import traceback |
| |
|
| | tb = traceback.format_exc() |
| | return f"エラー: {e}\n\n{tb}", "" |
| |
|
| |
|
| | FONT_CSS = """ |
| | /* Prefer common system fonts for Latin text */ |
| | body, .gradio-container, .prose, textarea, input, button, |
| | .gr-text-input input, .gr-text-input textarea, .gr-textbox textarea { |
| | font-family: system-ui, -apple-system, 'Segoe UI', Roboto, 'Noto Sans', |
| | 'Helvetica Neue', Arial, 'Apple Color Emoji', 'Segoe UI Emoji', |
| | sans-serif !important; |
| | } |
| | """ |
| |
|
| | with gr.Blocks(theme=gr.themes.Soft(), css=FONT_CSS) as demo: |
| | gr.Markdown( |
| | """ |
| | # Mecari Morpheme Analyzer |
| | |
| | 形態素解析器"Mecari"のデモです。Googleが発表した手法の非公式再現実装です。GitHub: https://github.com/zbller/Mecari |
| | """ |
| | ) |
| |
|
| | with gr.Row(): |
| | inp = gr.Textbox(label="テキスト入力", value="外国人参政権", placeholder="とうきょうに行った", lines=3) |
| | btn = gr.Button("解析する") |
| | with gr.Row(): |
| | out_mecari = gr.Textbox(label="Mecari", lines=10) |
| | out_mecab = gr.Textbox(label="MeCab(Jumandic)", lines=10) |
| | gr.Examples( |
| | examples=[ |
| | ["とうきょうに行った"], |
| | ["吾輩わがはいは猫である。名前はまだ無い。"] |
| | ], |
| | inputs=inp, |
| | outputs=[out_mecari, out_mecab], |
| | fn=analyze, |
| | label="Good examples", |
| | run_on_click=True, |
| | cache_examples=False, |
| | ) |
| | gr.Examples( |
| | examples=[ |
| | ["すもももももももものうち"], |
| | ["こちら葛飾区亀有公園前派出所"] |
| | ], |
| | inputs=inp, |
| | outputs=[out_mecari, out_mecab], |
| | fn=analyze, |
| | label="Bad examples", |
| | run_on_click=True, |
| | cache_examples=False, |
| | ) |
| | btn.click(fn=analyze, inputs=inp, outputs=[out_mecari, out_mecab]) |
| |
|
| | |
| | def _warmup(): |
| | try: |
| | _ensure_model() |
| | except Exception: |
| | pass |
| |
|
| | _warmup() |
| |
|
| | if __name__ == "__main__": |
| | demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860"))) |
| |
|