# -*- coding: utf-8 -*-
"""
Bibliometrics + SLR + Gap & Keyword Gap + Clustering + Heatmaps (Patched, Safe, Excel Fallback)

Features:
1) Parse RIS/BibTeX folder → structured table + safe deduplication
2) Bibliometrics: CSV/PNG summary (+ XLSX if engine available)
3) Keyword & co-authorship networks (CSV for Gephi)
4) SLR: year+abstract filter, clustering (auto/kmeans/spectral), quality (silhouette), preview
   + SAVE SLR SUMMARY TO XLSX
5) Gap Finder: Theme×Method, Theme×Region, Method×Region (+ scores)
6) Keyword Connection Gap: popular keyword pairs that rarely/never co-occur,
   + matrix & HEATMAP of gap pairs
7) Region filter (based on region dictionary labels)
8) Heatmaps for keyword co-occurrence & all pivots
9) Log results to Processing Log + return OUTPUT_FOLDER (click in Result Viewer)

Notes:
- If xlsxwriter/openpyxl is unavailable, XLSX → CSV + README_NO_EXCEL.txt
- Method & region dictionaries can be overridden via CSV
"""

from qgis.PyQt.QtCore import QCoreApplication, QVariant
from qgis.core import (
    QgsProcessing,
    QgsProcessingAlgorithm,
    QgsProcessingException,
    QgsProcessingParameterFolderDestination,
    QgsProcessingParameterFile,
    QgsProcessingParameterString,
    QgsProcessingParameterBoolean,
    QgsProcessingParameterNumber,
    QgsProcessingParameterFeatureSink,
    QgsFeature,
    QgsFields,
    QgsField,
    QgsWkbTypes,
    QgsPointXY,
    QgsGeometry,
    QgsCoordinateReferenceSystem
)

import os, re, io, json, glob, itertools, math
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from collections import Counter
# networkx optional
try:
    import networkx as nx
    _HAS_NX = True
except Exception:
    _HAS_NX = False


class BibliometrikSLRGapFinderPatched(QgsProcessingAlgorithm):
    """QGIS Processing algorithm for bibliometrics, SLR and gap finding.

    The class-level ``PARAM_*`` constants below are the string keys under
    which each parameter is registered in ``initAlgorithm``.
    """

    # --- parameter keys: input, filtering, plotting and outputs ---
    PARAM_INPUT_FOLDER = "INPUT_FOLDER"
    PARAM_FILE_PATTERN = "FILE_PATTERN"
    PARAM_YEAR_MIN = "YEAR_MIN"
    PARAM_YEAR_MAX = "YEAR_MAX"
    PARAM_MIN_EDGE_WEIGHT = "MIN_EDGE_WEIGHT"
    PARAM_DO_PLOTS = "DO_PLOTS"
    PARAM_OUTPUT_FOLDER = "OUTPUT_FOLDER"
    PARAM_COUNTRY_POINTS = "COUNTRY_POINTS"

    # --- gap finder: dictionary overrides, threshold and score weights ---
    PARAM_METHOD_DICT_CSV = "METHOD_DICT_CSV"
    PARAM_REGION_DICT_CSV = "REGION_DICT_CSV"
    PARAM_MIN_COUNT_GAP = "MIN_COUNT_GAP"
    PARAM_URGENCY = "URGENCY"
    PARAM_EMPTINESS = "EMPTINESS"
    PARAM_FEASIBILITY = "FEASIBILITY"

    # --- keyword connection gap ---
    PARAM_KW_TOP_N = "KW_TOP_N"
    PARAM_KW_MIN_NODE_FREQ = "KW_MIN_NODE_FREQ"
    PARAM_KW_MAX_COOCC = "KW_MAX_COOCC"
    PARAM_KW_SCORE = "KW_SCORE"

    # --- SLR clustering ---
    PARAM_CLUSTER_METHOD = "CLUSTER_METHOD"

    # --- region filter & heatmap size control ---
    PARAM_REGION_FILTER = "REGION_FILTER"
    PARAM_HEATMAP_TOPN = "HEATMAP_TOPN"

    def tr(self, text):
        """Translate *text* using this algorithm's Qt translation context."""
        context = "BibliometrikSLRGapFinderPatched"
        return QCoreApplication.translate(context, text)

    def createInstance(self):
        return BibliometrikSLRGapFinderPatched()

    def name(self):
        """Return the internal identifier string of this algorithm."""
        algorithm_id = "bibliometrik_slr_gapfinder_plus_heatmaps"
        return algorithm_id

    def displayName(self):
        """Return the translated, human-readable name of this algorithm."""
        label = self.tr("Bibliometric Analysis")
        return label

    def shortHelpString(self):
        """Return the translated HTML help text for this algorithm.

        The text is a single concatenated string literal (credits, feature
        list, inputs, a parameter table, outputs and notes) passed through
        ``self.tr``.
        """
        return self.tr(
            "<b>Bibliometric Analysis, SLR, Network & Gap Finder Tool</b><br>"
            "<b>Developed by:</b> <br>"
            "<b>Firman Afrianto</b><br>"
            "Contributors:<br>"
            "- Nuryantiningsih Pusporini<br>"
            "- Annisa Dira Hariyanto<br>"
            "- Dimas Tri Rendragraha<br>"
            "- Johan Wahyu Panuntun<br>"
            "- Angga Anugerah Ardana<br>"
            "- Maya Safira<br>"
            "- Adipandang Yudono<br>"
            "- Muhammad Sani Roychansyah<br>"
            "- Yori Herwangi<br>"
            "- Kiky Permana Setiawan<br><br>"

            "This tool performs complete bibliometric analysis, Systematic Literature Review (SLR), "
            "keyword & co-authorship network extraction, theme–method–region gap detection, "
            "heatmap generation, keyword trends, and automatic country-based affiliation analysis.<br><br>"

            "<b>New Features Added:</b><br>"
            "- Automatic country extraction from RIS fields (C1, AD, C2).<br>"
            "- Country column added to Articles List (CSV/XLSX).<br>"
            "- Automatic Top 10 Countries chart (PNG).<br>"
            "- Optional creation of world country point layer using predefined centroids.<br>"
            "- Full compatibility with both RIS and BibTeX formats.<br><br>"

            "<b>Input:</b><br>"
            "- Folder containing *.RIS / *.BIB files.<br>"
            "- Optional region/method dictionary CSVs.<br>"
            "- Optional output sink for country point layer.<br><br>"

            "<b>Parameters:</b><br>"
            "<table border='1' cellspacing='0' cellpadding='3'>"
            "<tr><th>Name</th><th>Description</th><th>Default</th></tr>"
            "<tr><td>Input folder</td><td>Folder with RIS/BibTeX files</td><td>-</td></tr>"
            "<tr><td>File pattern</td><td>Pattern (e.g., *.ris;*.bib)</td><td>*.ris;*.RIS;*.bib;*.BIB</td></tr>"
            "<tr><td>Minimum year</td><td>Lower bound for SLR</td><td>1950</td></tr>"
            "<tr><td>Maximum year</td><td>Upper bound for SLR</td><td>2025</td></tr>"
            "<tr><td>Edge weight threshold</td><td>Minimum co-occurrence</td><td>2</td></tr>"
            "<tr><td>Create PNG charts</td><td>Generate figures</td><td>True</td></tr>"
            "<tr><td>Method dictionary CSV</td><td>Pattern → method mapping</td><td>(optional)</td></tr>"
            "<tr><td>Region dictionary CSV</td><td>Pattern → region mapping</td><td>(optional)</td></tr>"
            "<tr><td>Gap threshold</td><td>Max count for a gap cell</td><td>2</td></tr>"
            "<tr><td>Urgency weight</td><td>Gap urgency factor</td><td>1.0</td></tr>"
            "<tr><td>Emptiness weight</td><td>Scarcity weight</td><td>1.0</td></tr>"
            "<tr><td>Feasibility weight</td><td>Research feasibility factor</td><td>1.0</td></tr>"
            "<tr><td>Top keywords</td><td>Keyword limit</td><td>50</td></tr>"
            "<tr><td>Min keyword freq</td><td>Minimum occurrences</td><td>3</td></tr>"
            "<tr><td>Max co-occurrence</td><td>Upper limit for gap</td><td>1</td></tr>"
            "<tr><td>Keyword gap score</td><td>jaccard / pmi</td><td>jaccard</td></tr>"
            "<tr><td>Clustering method</td><td>auto / kmeans / spectral</td><td>auto</td></tr>"
            "<tr><td>Region filter</td><td>List of region labels</td><td>(all)</td></tr>"
            "<tr><td>Max heatmap size</td><td>Limit rows/cols</td><td>30</td></tr>"
            "<tr><td>Country points</td><td>Output layer for country centroids</td><td>(optional)</td></tr>"
            "<tr><td>Output folder</td><td>Destination folder</td><td>-</td></tr>"
            "</table><br><br>"

            "<b>Outputs:</b><br>"
            "- Bibliometric summary (Excel/CSV + PNG).<br>"
            "- Articles List including the new <i>country</i> field.<br>"
            "- Top 10 Countries chart (PNG).<br>"
            "- Optional world country point layer (EPSG:4326).<br>"
            "- Keyword & author networks (CSV/PNG).<br>"
            "- Keyword trend tables and charts.<br>"
            "- SLR clustering (summary, quality metrics, preview).<br>"
            "- Heatmaps for Theme–Method, Theme–Region, Method–Region.<br>"
            "- Gap Analysis (Excel/CSV + Markdown + PNG heatmaps).<br><br>"

            "<b>Notes:</b><br>"
            "- XLSX requires xlsxwriter/openpyxl (fallback = CSV).<br>"
            "- Country detection uses name matching inside affiliation text.<br>"
            "- Centroid dictionary must be filled by user for country point layer.<br>"
            "- All charts saved as PNG when enabled.<br>"
        )

    # ---------- Parameters ----------
    def initAlgorithm(self, config=None):
        """Register all input and output parameters of the algorithm.

        The parameter definitions are collected in a list, in the order they
        are to be registered, and then added one by one.
        """
        as_integer = QgsProcessingParameterNumber.Integer
        as_double = QgsProcessingParameterNumber.Double
        pick_file = QgsProcessingParameterFile.File
        pick_folder = QgsProcessingParameterFile.Folder

        definitions = [
            # --- input files and SLR year window ---
            QgsProcessingParameterFile(
                self.PARAM_INPUT_FOLDER,
                self.tr("Input folder containing RIS or BibTeX files"),
                behavior=pick_folder),
            QgsProcessingParameterString(
                self.PARAM_FILE_PATTERN,
                self.tr("File pattern"),
                defaultValue="*.ris;*.RIS;*.bib;*.BIB"),
            QgsProcessingParameterNumber(
                self.PARAM_YEAR_MIN,
                self.tr("Minimum year for SLR"),
                type=as_integer, defaultValue=1950),
            QgsProcessingParameterNumber(
                self.PARAM_YEAR_MAX,
                self.tr("Maximum year for SLR"),
                type=as_integer, defaultValue=2025),
            QgsProcessingParameterNumber(
                self.PARAM_MIN_EDGE_WEIGHT,
                self.tr("Network edge weight threshold"),
                type=as_integer, defaultValue=2),
            QgsProcessingParameterBoolean(
                self.PARAM_DO_PLOTS,
                self.tr("Create PNG charts"),
                defaultValue=True),

            # --- gap finder: dictionaries, threshold, weights ---
            QgsProcessingParameterFile(
                self.PARAM_METHOD_DICT_CSV,
                self.tr("Method dictionary CSV (optional pattern,label)"),
                behavior=pick_file, optional=True),
            QgsProcessingParameterFile(
                self.PARAM_REGION_DICT_CSV,
                self.tr("Region dictionary CSV (optional pattern,label,level)"),
                behavior=pick_file, optional=True),
            QgsProcessingParameterNumber(
                self.PARAM_MIN_COUNT_GAP,
                self.tr("Article count threshold for a gap"),
                type=as_integer, defaultValue=2),
            QgsProcessingParameterNumber(
                self.PARAM_URGENCY,
                self.tr("Urgency weight"),
                type=as_double, defaultValue=1.0),
            QgsProcessingParameterNumber(
                self.PARAM_EMPTINESS,
                self.tr("Emptiness weight"),
                type=as_double, defaultValue=1.0),
            QgsProcessingParameterNumber(
                self.PARAM_FEASIBILITY,
                self.tr("Feasibility weight"),
                type=as_double, defaultValue=1.0),

            # --- keyword gap ---
            QgsProcessingParameterNumber(
                self.PARAM_KW_TOP_N,
                self.tr("Top keywords to analyze for gaps"),
                type=as_integer, defaultValue=50),
            QgsProcessingParameterNumber(
                self.PARAM_KW_MIN_NODE_FREQ,
                self.tr("Minimum keyword frequency to be considered"),
                type=as_integer, defaultValue=3),
            QgsProcessingParameterNumber(
                self.PARAM_KW_MAX_COOCC,
                self.tr("Maximum keyword co-occurrence to be considered a gap (<=)"),
                type=as_integer, defaultValue=1),
            QgsProcessingParameterString(
                self.PARAM_KW_SCORE,
                self.tr("Gap score ('jaccard' or 'pmi')"),
                defaultValue="jaccard"),

            # --- clustering ---
            QgsProcessingParameterString(
                self.PARAM_CLUSTER_METHOD,
                self.tr("SLR clustering method ('auto', 'kmeans', or 'spectral')"),
                defaultValue="auto"),

            # --- region filter & heatmap ---
            QgsProcessingParameterString(
                self.PARAM_REGION_FILTER,
                self.tr("Region filter (labels, comma-separated) — leave empty to include all"),
                defaultValue="",
                optional=True),
            QgsProcessingParameterNumber(
                self.PARAM_HEATMAP_TOPN,
                self.tr("Max rows/columns for heatmaps (to keep readable)"),
                type=as_integer,
                defaultValue=30),

            # --- outputs ---
            QgsProcessingParameterFolderDestination(
                self.PARAM_OUTPUT_FOLDER,
                self.tr("Output folder")),
            # Optional: country point output layer (requires the country
            # centroid dictionary)
            QgsProcessingParameterFeatureSink(
                self.PARAM_COUNTRY_POINTS,
                self.tr("Output country points (optional)"),
                QgsProcessing.TypeVectorPoint),
        ]

        for definition in definitions:
            self.addParameter(definition)

    # ---------- Helpers: Parsers ----------
    def _glob_many(self, folder, pattern_string):
        patterns = [p.strip() for p in pattern_string.split(";") if p.strip()]
        files = []
        for pat in patterns:
            # Tambahkan wildcard kalau user menulis ".ris" saja
            if not any(ch in pat for ch in ["*", "?", "["]):
                if pat.startswith("."):
                    pat = "*" + pat
                elif not pat.startswith("*"):
                    pat = "*" + pat
            files.extend(glob.glob(os.path.join(folder, pat)))
        return sorted(set(files))

    def parse_ris_folder(self, folder, patterns):
        ris_line = re.compile(r"^([A-Z0-9]{2})  - (.*)$")
        records_all, files = [], self._glob_many(folder, patterns)
        for fp in files:
            if not fp.lower().endswith(".ris"): continue
            current, last_tag = {}, None
            with io.open(fp, "r", encoding="utf-8", errors="ignore") as f:
                for raw in f:
                    line = raw.rstrip("\n")
                    m = ris_line.match(line)
                    if m:
                        tag, content = m.group(1), m.group(2).strip()
                        last_tag = tag
                        if tag in current:
                            if isinstance(current[tag], list): current[tag].append(content)
                            else: current[tag] = [current[tag], content]
                        else: current[tag] = content
                    else:
                        if last_tag is not None:
                            cont = line.strip()
                            if cont:
                                if isinstance(current[last_tag], list): current[last_tag][-1] += " " + cont
                                else: current[last_tag] += " " + cont
                    if line.startswith("ER  -"):
                        if current: records_all.append(current); current, last_tag = {}, None
            if current: records_all.append(current)
        return pd.DataFrame(records_all) if records_all else pd.DataFrame()

    def parse_bibtex_folder(self, folder, patterns):
        entries, files = [], self._glob_many(folder, patterns)
        for fp in files:
            if not fp.lower().endswith(".bib"): continue
            with io.open(fp, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
            chunks = re.split(r"(?m)^@", text)
            for ch in chunks:
                ch = ch.strip()
                if not ch: continue
                m = re.match(r"([A-Za-z]+)\s*\{\s*([^,]+)\s*,", ch)
                if not m: continue
                body = ch[m.end():]
                fields = {}
                for fm in re.finditer(r"(\w+)\s*=\s*(\{(?:[^{}]|\{[^{}]*\})*\}|\"[^\"]*\")\s*,?", body, flags=re.S):
                    k = fm.group(1).lower(); v = fm.group(2).strip()
                    if v.startswith("{") and v.endswith("}"): v = v[1:-1]
                    if v.startswith('"') and v.endswith('"'): v = v[1:-1]
                    fields[k] = re.sub(r"\s+", " ", v).strip()
                entries.append({
                    "TI": fields.get("title"),
                    "AU": self._split_authors(fields.get("author")),
                    "JO": fields.get("journal") or fields.get("booktitle"),
                    "PY": fields.get("year"),
                    "KW": self._split_keywords(fields.get("keywords")),
                    "AB": fields.get("abstract"),
                    "DO": fields.get("doi"),
                    "UR": fields.get("url")
                })
        return pd.DataFrame(entries) if entries else pd.DataFrame()

    def _split_authors(self, s):
        if not s: return []
        parts = re.split(r"\s+and\s+|;", s)
        return [p.strip() for p in parts if p.strip()]

    def _split_keywords(self, s):
        if not s: return []
        parts = re.split(r";|,|\||/|\\", s)
        return [p.strip() for p in parts if p.strip()]

    # ---------- Helpers: General ----------
    def _as_list(self, x):
        if isinstance(x, list): return x
        if x is None or (isinstance(x, float) and np.isnan(x)): return []
        parts = re.split(r";|,|\||/|\\", str(x))
        return [p.strip() for p in parts if p.strip()]

    def _norm_text(self, x):
        if x is None or (isinstance(x, float) and np.isnan(x)): return None
        return re.sub(r"\s+", " ", str(x).strip().lower())

    def _extract_year_series(self, df):
        years = []
        for i in range(len(df)):
            y = df.at[i, "PY"] if "PY" in df.columns else None
            if not y and "Y1" in df.columns: y = df.at[i, "Y1"]
            if not y and "DA" in df.columns: y = df.at[i, "DA"]
            if isinstance(y, list): y = y[0] if y else None
            if y is None or (isinstance(y, float) and np.isnan(y)): years.append(None)
            else:
                m = re.search(r"\b(19|20)\d{2}\b", str(y))
                years.append(int(m.group(0)) if m else None)
        return years

    def _clean_kw_list(self, lst):
        """Normalize keyword list to lowercase, remove separators & extra spaces."""
        if lst is None: return []
        out = []
        if not isinstance(lst, list): lst = [lst]
        for kw in lst:
            if kw is None: continue
            parts = re.split(r";|,|\||/|\\", str(kw))
            for p in parts:
                t = re.sub(r"\s+", " ", p).strip().lower()
                if t: out.append(t)
        return out

    def _safe_div(self, a, b):
        return (a / b) if (b not in (0, None) and a is not None) else 0.0

    # ---------- Helpers: Co-occurrence ----------
    def _cooccurrence(self, lists, min_weight=2):
        edge_counter, node_counter = Counter(), Counter()
        for lst in lists:
            uniq = sorted(set([k for k in lst if k]))
            for k in uniq: node_counter[k] += 1
            for i in range(len(uniq)):
                for j in range(i+1, len(uniq)):
                    a, b = uniq[i], uniq[j]; edge_counter[(a,b)] += 1
        edges = [(a,b,w) for (a,b),w in edge_counter.items() if w >= min_weight]
        nodes_df = pd.DataFrame(list(node_counter.items()), columns=["id","freq"]).sort_values("freq", ascending=False)
        edges_df = pd.DataFrame(edges, columns=["source","target","weight"]).sort_values("weight", ascending=False)
        return nodes_df, edges_df

    # ---------- Helpers: Plots ----------
    def _plot_barh(self, df_sub, xcol, ycol, title, outpng):
        """Save a horizontal bar chart of ``df_sub[xcol]`` per ``df_sub[ycol]``.

        Rows are drawn in reverse order, so the first row of *df_sub* ends up
        at the top. The figure is written to *outpng* and then closed.
        """
        fig, ax = plt.subplots(figsize=(8, 4.5))
        labels = df_sub[ycol][::-1]
        values = df_sub[xcol][::-1]
        ax.barh(labels, values)
        ax.set_title(title)
        ax.set_xlabel(xcol)
        ax.set_ylabel(ycol)
        fig.tight_layout()
        fig.savefig(outpng, dpi=160)
        plt.close(fig)

    def _plot_line(self, x, y, title, outpng, xlabel, ylabel):
        """Save a line chart of *y* over *x* with circle markers.

        Horizontal grid lines are drawn; the figure is written to *outpng*
        and then closed.
        """
        fig, ax = plt.subplots(figsize=(8, 4.5))
        ax.plot(x, y, marker="o")
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.grid(True, axis="y", linewidth=0.4)
        fig.tight_layout()
        fig.savefig(outpng, dpi=160)
        plt.close(fig)

    def _plot_heatmap(self, df_mat, title, outpng, vmax=None, highlight_topn=0):
        """Plot a heatmap from a matrix DataFrame (index & columns are labels).
           highlight_topn > 0 will overlay red on Top-N cells above the diagonal.
        """
        if df_mat is None or df_mat.empty:
            return

        import numpy as np
        import matplotlib.pyplot as plt
        import pandas as pd

        arr = df_mat.values.astype(float)

        # Safe color scaling to avoid “flat” appearance
        if vmax is None:
            vmax = np.nanmax(arr)
            if not np.isfinite(vmax) or vmax <= 0:
                vmax = 1.0  # still visible even if all zeros

        plt.figure(figsize=(8, 6))
        ax = plt.gca()
        im = ax.imshow(arr, aspect="auto", interpolation="nearest",
                       vmin=0, vmax=vmax, cmap="Greys", zorder=1)
        ax.set_title(title)
        ax.set_xticks(range(len(df_mat.columns)))
        ax.set_yticks(range(len(df_mat.index)))
        ax.set_xticklabels(df_mat.columns, rotation=90, fontsize=7)
        ax.set_yticklabels(df_mat.index, fontsize=7)
        plt.colorbar(im, fraction=0.046, pad=0.04)

        # === Highlight Top-N (typically for GAP heatmap) ===
        if highlight_topn and highlight_topn > 0 and df_mat.shape[0] >= 2 and df_mat.shape[1] >= 2:
            pairs = []
            # use only cells above the diagonal to avoid duplicates
            for i in range(df_mat.shape[0]):
                for j in range(i+1, df_mat.shape[1]):
                    score = arr[i, j]
                    pairs.append((i, j, score))
            if pairs:
                pairs.sort(key=lambda x: x[2], reverse=True)
                for (i, j, _) in pairs[:highlight_topn]:
                    ax.add_patch(plt.Rectangle((j-0.5, i-0.5), 1, 1, fill=True, color="red", alpha=0.45, zorder=2))
                    ax.add_patch(plt.Rectangle((i-0.5, j-0.5), 1, 1, fill=True, color="red", alpha=0.45, zorder=2))

        plt.tight_layout()
        plt.savefig(outpng, dpi=160)
        plt.close()

    def _plot_network(self, nodes_df, edges_df, outpng, title="Network", node_id_col="id",
                      node_freq_col="freq", src_col="source", tgt_col="target", w_col="weight",
                      max_nodes=200, max_edges_fallback=1000, feedback=None):
        """
        Draw a readable network plot to *outpng* and return node centralities.

        Steps performed:
        - Keep the top *max_nodes* nodes by *node_freq_col* (when that column exists)
        - Keep only edges between those nodes; if none remain, fall back to
          the top *max_edges_fallback* edges by weight
        - Drop nodes that have no edge
        - Keep the largest connected component (giant component)
        - kamada_kawai layout (fallback: spring)
        - Thin, semi-transparent edges; width scales with edge weight
        - LABEL ALL NODES, node size ∝ betweenness

        Returns:
            ``False`` when nothing could be plotted (empty inputs, networkx
            missing, or an empty graph); otherwise a DataFrame with columns
            node, degree, wdegree, betweenness and freq. If building that
            table fails, a DataFrame with only the ``node`` column is returned.

        *feedback*, when given, receives progress messages via ``pushInfo``.

        NOTE(review): networkx documents the ``weight`` used by betweenness
        centrality as a distance, while here the weights are co-occurrence
        counts — confirm this is the intended interpretation.
        """
        if nodes_df is None or nodes_df.empty or edges_df is None or edges_df.empty:
            if feedback: feedback.pushInfo("[Network] nodes/edges empty — skip plot.")
            return False
        if not _HAS_NX:
            if feedback: feedback.pushInfo("[Network] 'networkx' not available — skip plot.")
            return False

        # Logging is best-effort: a failing feedback object must not abort the plot
        try:
            if feedback: feedback.pushInfo(f"[Network] Start: nodes={len(nodes_df)}, edges={len(edges_df)}")
        except Exception:
            pass

        # --- Select candidate nodes (top-N by freq)
        nd = nodes_df.copy()
        if node_freq_col in nd.columns:
            nd = nd.sort_values(node_freq_col, ascending=False).head(max_nodes)

        # Node ids are compared as strings on both the node and the edge side
        ed = edges_df.copy()
        ed[src_col] = ed[src_col].astype(str)
        ed[tgt_col] = ed[tgt_col].astype(str)

        keep_nodes = set(nd[node_id_col].astype(str))
        ed_filt = ed[ed[src_col].isin(keep_nodes) & ed[tgt_col].isin(keep_nodes)]

        # --- Fallback: Top-M edges if empty
        if ed_filt.empty:
            if feedback: feedback.pushInfo("[Network] Fallback: Top-M edges by weight.")
            ed_filt = ed.sort_values(w_col, ascending=False).head(max_edges_fallback) if w_col in ed.columns else ed.head(max_edges_fallback)

        # --- Keep nodes that actually have edges
        nodes_with_edges = set(ed_filt[src_col]).union(set(ed_filt[tgt_col]))
        nd = nd[nd[node_id_col].astype(str).isin(nodes_with_edges)].copy()
        if nd.empty:
            if feedback: feedback.pushInfo("[Network] No nodes with edges — abort plot.")
            return False

        # --- Build graph
        import networkx as nx, numpy as np, matplotlib.pyplot as plt
        G = nx.Graph()
        for r in nd.itertuples(index=False):
            nid = str(getattr(r, node_id_col))
            nf = float(getattr(r, node_freq_col)) if node_freq_col in nd.columns else 1.0
            G.add_node(nid, freq=nf)
        for r in ed_filt.itertuples(index=False):
            u, v = str(getattr(r, src_col)), str(getattr(r, tgt_col))
            # Skip self-loops
            if u == v:
                continue
            w = float(getattr(r, w_col)) if w_col in ed_filt.columns else 1.0
            # Duplicate edges are merged by summing their weights
            if G.has_edge(u, v): G[u][v]["weight"] += w
            else: G.add_edge(u, v, weight=w)

        # --- Keep largest component
        if G.number_of_edges() == 0 or G.number_of_nodes() == 0:
            if feedback: feedback.pushInfo("[Network] Graph empty after construction.")
            return False
        try:
            if nx.is_connected(G):
                G_main = G
            else:
                cc = list(nx.connected_components(G))
                if not cc:
                    if feedback: feedback.pushInfo("[Network] No connected component.")
                    return False
                largest_cc = max(cc, key=len)
                G_main = G.subgraph(largest_cc).copy()
            G = G_main
        except Exception:
            # If component extraction fails, continue with the full graph
            pass

        if G.number_of_edges() == 0 or G.number_of_nodes() == 0:
            if feedback: feedback.pushInfo("[Network] Largest component empty.")
            return False

        # --- Layout (primary: kamada_kawai; fallback: spring)
        try:
            pos = nx.kamada_kawai_layout(G, weight="weight")
        except Exception:
            try:
                pos = nx.spring_layout(G, k=None, iterations=60, seed=42, weight="weight")
            except Exception:
                pos = nx.spring_layout(G, seed=42)

        # --- Centrality for node size (BETWEENNESS) + edge visual scale
        btw = nx.betweenness_centrality(G, weight="weight", normalized=True)
        btw_vals = np.array([btw.get(n, 0.0) for n in G.nodes()])
        if btw_vals.max() == 0:
            btw_vals = btw_vals + 1e-6  # avoid division by zero
        node_sizes = 180 + 520 * (btw_vals / btw_vals.max())                  # ~180–700 px

        e_w = np.array([G[u][v].get("weight", 1.0) for u, v in G.edges()])
        if e_w.size == 0: e_w = np.array([1.0])
        edge_widths = 0.6 + 2.8 * (e_w / e_w.max())                           # 0.6–3.4 px

        # --- Plot
        plt.figure(figsize=(10.5, 7.5))
        nx.draw_networkx_edges(G, pos, width=edge_widths, alpha=0.35, edge_color="gray")
        nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color="#2a9d8f")
        nx.draw_networkx_labels(G, pos, font_size=8)                          # LABEL ALL NODES

        plt.title(title); plt.axis("off"); plt.tight_layout()
        plt.savefig(outpng, dpi=180); plt.close()

        # --- Centrality output (returned, includes betweenness)
        try:
            deg = dict(G.degree()); wdeg = dict(G.degree(weight="weight"))
            df_cent = pd.DataFrame({
                "node": list(G.nodes()),
                "degree": [deg.get(n,0) for n in G.nodes()],
                "wdegree": [wdeg.get(n,0.0) for n in G.nodes()],
                "betweenness": [btw.get(n,0.0) for n in G.nodes()],
                "freq": [G.nodes[n].get("freq", np.nan) for n in G.nodes()]
            }).sort_values(["betweenness","wdegree","degree"], ascending=False)
        except Exception:
            # Fall back to a minimal table that only lists the plotted nodes
            df_cent = pd.DataFrame({"node": list(G.nodes())})

        return df_cent

    # ---------- Helpers: Excel Writer Fallback ----------
    def _get_excel_writer(self, path):
        try:
            import xlsxwriter  # noqa
            return pd.ExcelWriter(path, engine="xlsxwriter")
        except Exception:
            try:
                import openpyxl  # noqa
                return pd.ExcelWriter(path, engine="openpyxl")
            except Exception:
                return None
    
    # ---------- Country centroid dictionary (user editable) ----------
    def _country_coord_dict(self):
        """
        Return a dictionary: country name → (lon, lat) in EPSG:4326.

        IMPORTANT:
        - Fill in the list of countries and their centroid coordinates yourself.
        - Example format:

              return {
                  "Indonesia": (113.9213, -0.7893),
                  "Japan": (138.2529, 36.2048),
              }

        - If left empty ({}), the country analysis & point layer
          will not be created (safe, no error).
        """
        return {
            "Afghanistan": (67.709953, 33.93911),
            "Akrotiri and Dhekelia": (33.708, 34.597),
            "Åland": (19.896, 60.178),
            "Albania": (20.1683, 41.1533),
            "Algeria": (1.6596, 28.0339),
            "American Samoa": (-170.702, -14.271),
            "Andorra": (1.5218, 42.5063),
            "Angola": (17.8739, -11.2027),
            "Anguilla": (-63.0686, 18.2206),
            "Antigua and Barbuda": (-61.7964, 17.0608),
            "Argentina": (-63.6167, -38.4161),
            "Armenia": (45.0382, 40.0691),
            "Aruba": (-69.9683, 12.5211),
            "Australia": (133.7751, -25.2744),
            "Austria": (14.5501, 47.5162),
            "Azerbaijan": (47.5769, 40.1431),
            "Bahamas": (-77.3963, 25.0343),
            "Bahrain": (50.5577, 26.0667),
            "Bangladesh": (90.3563, 23.685),
            "Barbados": (-59.5432, 13.1939),
            "Belarus": (27.9534, 53.7098),
            "Belgium": (4.4699, 50.5039),
            "Belize": (-88.4976, 17.1899),
            "Benin": (2.3158, 9.3077),
            "Bermuda": (-64.7505, 32.3078),
            "Bhutan": (90.4336, 27.5142),
            "Bolivia": (-63.5887, -16.2902),
            "Bonaire, Saint Eustatius and Saba": (-68.2667, 12.1500),
            "Bosnia and Herzegovina": (17.6791, 43.9159),
            "Botswana": (24.6849, -22.3285),
            "Bouvet Island": (3.413, -54.420),
            "Brazil": (-51.9253, -14.235),
            "British Indian Ocean Territory": (72.424, -7.318),
            "British Virgin Islands": (-64.623, 18.4207),
            "Brunei": (114.7277, 4.5353),
            "Bulgaria": (25.4858, 42.7339),
            "Burkina Faso": (-1.5616, 12.2383),
            "Burundi": (29.9189, -3.3731),
            "Cambodia": (104.991, 12.5657),
            "Cameroon": (12.3547, 7.3697),
            "Canada": (-106.3468, 56.1304),
            "Cabo Verde": (-23.6052, 16.5388),
            "Caspian Sea": (51.628, 41.677),
            "Cayman Islands": (-81.2546, 19.3133),
            "Central African Republic": (20.9394, 6.6111),
            "Chad": (18.7322, 15.4542),
            "Chile": (-71.543, -35.6751),
            "China": (104.1954, 35.8617),
            "Christmas Island": (105.6904, -10.4475),
            "Clipperton Island": (-109.2167, 10.2833),
            "Cocos Islands": (96.87096, -12.1642),
            "Colombia": (-74.2973, 4.5709),
            "Comoros": (43.3333, -11.6455),
            "Cook Islands": (-159.7777, -21.2367),
            "Costa Rica": (-83.7534, 9.7489),
            "Côte d’Ivoire": (-5.5471, 7.54),
            "Croatia": (15.2, 45.1),
            "Cuba": (-77.7812, 21.5218),
            "Curaçao": (-68.99, 12.1696),
            "Cyprus": (33.4299, 35.1264),
            "Czechia": (15.4729, 49.8175),
            "Democratic Republic of the Congo": (21.7587, -4.0383),
            "Denmark": (9.5018, 56.2639),
            "Djibouti": (43.145, 11.8251),
            "Dominica": (-61.3709, 15.415),
            "Dominican Republic": (-70.1627, 18.7357),
            "Ecuador": (-78.1834, -1.8312),
            "Egypt": (30.8025, 26.8206),
            "El Salvador": (-88.8965, 13.7942),
            "Equatorial Guinea": (10.2679, 1.6508),
            "Eritrea": (39.7823, 15.1794),
            "Estonia": (25.0136, 58.5953),
            "Ethiopia": (40.4897, 9.145),
            "Falkland Islands": (-59.5236, -51.7963),
            "Faroe Islands": (-6.9118, 61.8926),
            "Fiji": (178.065, -17.7134),
            "Finland": (25.7482, 61.9241),
            "France": (2.2137, 46.2276),
            "French Guiana": (-53.1258, 3.9339),
            "French Polynesia": (-149.4068, -17.6797),
            "French Southern Territories": (69.3486, -49.2804),
            "Gabon": (11.6094, -0.8037),
            "Gambia": (-15.3101, 13.4432),
            "Georgia": (43.3569, 42.3154),
            "Germany": (10.4515, 51.1657),
            "Ghana": (-1.0232, 7.9465),
            "Gibraltar": (-5.3454, 36.1408),
            "Greece": (21.8243, 39.0742),
            "Greenland": (-42.6043, 71.7069),
            "Grenada": (-61.679, 12.1165),
            "Guadeloupe": (-61.551, 16.265),
            "Guam": (144.7937, 13.4443),
            "Guatemala": (-90.2308, 15.7835),
            "Guernsey": (-2.5853, 49.4657),
            "Guinea": (-9.6966, 9.9456),
            "Guinea-Bissau": (-15.1804, 11.8037),
            "Guyana": (-58.9302, 4.8604),
            "Haiti": (-72.2852, 18.9712),
            "Heard Island and McDonald Islands": (73.5042, -53.0947),
            "Honduras": (-86.2419, 15.2),
            "Hong Kong": (114.1095, 22.3964),
            "Hungary": (19.5033, 47.1625),
            "Iceland": (-19.0208, 64.9631),
            "India": (78.9629, 20.5937),
            "Indonesia": (113.9213, -0.7893),
            "Iran": (53.688, 32.4279),
            "Iraq": (43.6793, 33.2232),
            "Ireland": (-8.2439, 53.4129),
            "Isle of Man": (-4.5481, 54.2361),
            "Israel": (34.8516, 31.0461),
            "Italy": (12.5674, 41.8719),
            "Jamaica": (-77.2975, 18.1096),
            "Japan": (138.2529, 36.2048),
            "Jersey": (-2.1312, 49.2144),
            "Jordan": (36.2384, 30.5852),
            "Kazakhstan": (66.9237, 48.0196),
            "Kenya": (37.9062, -0.0236),
            "Kiribati": (-157.362, 1.8709),
            "Kosovo": (20.902, 42.6026),
            "Kuwait": (47.4818, 29.3117),
            "Kyrgyzstan": (74.7661, 41.2044),
            "Laos": (102.4955, 19.8563),
            "Latvia": (24.6032, 56.8796),
            "Lebanon": (35.8623, 33.8547),
            "Lesotho": (28.2336, -29.6099),
            "Liberia": (-9.4295, 6.4281),
            "Libya": (17.228, 26.3351),
            "Liechtenstein": (9.5554, 47.166),
            "Lithuania": (23.8813, 55.1694),
            "Luxembourg": (6.1296, 49.8153),
            "Macao": (113.5439, 22.1987),
            "Macedonia": (21.7453, 41.6086),
            "Madagascar": (46.8691, -18.7669),
            "Malawi": (34.3015, -13.2543),
            "Malaysia": (101.9758, 4.2105),
            "Maldives": (73.2207, 3.2028),
            "Mali": (-3.9962, 17.5707),
            "Malta": (14.3754, 35.9375),
            "Marshall Islands": (171.1845, 7.1315),
            "Martinique": (-61.0242, 14.6415),
            "Mauritania": (-10.9408, 21.0079),
            "Mauritius": (57.5522, -20.3484),
            "Mayotte": (45.1662, -12.8275),
            "Mexico": (-102.5528, 23.6345),
            "Micronesia": (150.5508, 7.4256),
            "Moldova": (28.3699, 47.4116),
            "Monaco": (7.4246, 43.7384),
            "Mongolia": (103.8467, 46.8625),
            "Montenegro": (19.3744, 42.7087),
            "Montserrat": (-62.1874, 16.7425),
            "Morocco": (-7.0926, 31.7917),
            "Mozambique": (35.5296, -18.6657),
            "Myanmar": (95.956, 21.9162),
            "Namibia": (18.4904, -22.9576),
            "Nauru": (166.932, -0.5228),
            "Nepal": (84.124, 28.3949),
            "Netherlands": (5.2913, 52.1326),
            "New Caledonia": (165.618, -20.9043),
            "New Zealand": (174.886, -40.9006),
            "Nicaragua": (-85.2072, 12.8654),
            "Niger": (8.0817, 17.6078),
            "Nigeria": (8.6753, 9.082),
            "Niue": (-169.8672, -19.0544),
            "Norfolk Island": (167.9547, -29.0408),
            "North Korea": (127.5101, 40.3399),
            "Northern Cyprus": (32.8932, 35.1833),
            "Northern Mariana Islands": (145.757, 15.1064),
            "Norway": (8.4689, 60.472),
            "Oman": (55.9232, 21.5126),
            "Pakistan": (69.3451, 30.3753),
            "Palau": (134.5825, 7.51498),
            "Palestine": (35.2332, 31.9522),
            "Panama": (-80.7821, 8.538),
            "Papua New Guinea": (143.9555, -6.3149),
            "Paraguay": (-58.4438, -23.4425),
            "Peru": (-75.0152, -9.19),
            "Philippines": (122.888, 11.0046),
            "Pitcairn Islands": (-130.1015, -25.0664),
            "Poland": (19.1451, 51.9194),
            "Portugal": (-8.2245, 39.3999),
            "Puerto Rico": (-66.5901, 18.2208),
            "Qatar": (51.1839, 25.3548),
            "Republic of the Congo": (15.8277, -0.228),
            "Réunion": (55.5364, -21.1151),
            "Romania": (24.9668, 45.9432),
            "Russia": (105.3188, 61.524),
            "Rwanda": (29.8739, -1.9403),
            "Saint-Barthélemy": (-62.8333, 17.9),
            "Saint-Martin": (-63.0333, 18.0833),
            "Saint Helena, Ascension and Tristan da Cunha": (-10.0307, -15.965),
            "Saint Kitts and Nevis": (-62.782, 17.3578),
            "Saint Lucia": (-60.9789, 13.9094),
            "Saint Pierre and Miquelon": (-56.2656, 46.8852),
            "Saint Vincent and the Grenadines": (-61.2872, 13.2528),
            "Samoa": (-172.1046, -13.759),
            "San Marino": (12.4578, 43.9424),
            "São Tomé and Príncipe": (6.6131, 0.1864),
            "Saudi Arabia": (45.0792, 23.8859),
            "Senegal": (-14.4524, 14.4974),
            "Serbia": (21.0059, 44.0165),
            "Seychelles": (55.492, -4.6796),
            "Sierra Leone": (-11.7799, 8.4606),
            "Singapore": (103.8198, 1.3521),
            "Slovakia": (19.699, 48.669),
            "Slovenia": (14.9955, 46.1512),
            "Solomon Islands": (160.154, -9.6457),
            "Somalia": (46.1996, 5.1521),
            "South Africa": (22.9375, -30.5595),
            "South Georgia and the South Sandwich Islands": (-36.5, -54.5),
            "South Korea": (127.7669, 35.9078),
            "South Sudan": (31.3069, 6.8769),
            "Spain": (-3.7492, 40.4637),
            "Sri Lanka": (80.7718, 7.8731),
            "Sudan": (30.2176, 12.8628),
            "Suriname": (-56.0278, 3.9193),
            "Svalbard and Jan Mayen": (23.6703, 77.5536),
            "Swaziland": (31.4659, -26.5225),
            "Sweden": (18.6435, 60.1282),
            "Switzerland": (8.2275, 46.8182),
            "Syria": (38.9968, 34.8021),
            "Taiwan": (120.9605, 23.6978),
            "Tajikistan": (71.2761, 38.861),
            "Tanzania": (34.8888, -6.369),
            "Thailand": (100.9925, 15.870),
            "Timor-Leste": (125.7275, -8.8742),
            "Togo": (0.8248, 8.6195),
            "Tokelau": (-171.8559, -9.1676),
            "Tonga": (-175.1982, -21.1789),
            "Trinidad and Tobago": (-61.2225, 10.6918),
            "Tunisia": (9.5375, 33.8869),
            "Turkey": (35.2433, 38.9637),
            "Turkmenistan": (59.5563, 38.9697),
            "Turks and Caicos Islands": (-71.7979, 21.694),
            "Tuvalu": (178.6799, -7.1095),
            "Uganda": (32.2903, 1.3733),
            "Ukraine": (31.1656, 48.3794),
            "United Arab Emirates": (54.3773, 23.4241),
            "United Kingdom": (-3.4359, 55.3781),
            "United States": (-95.7129, 37.0902),
            "United States Minor Outlying Islands": (-162.057, 19.296),
            "Uruguay": (-55.7658, -32.5228),
            "Uzbekistan": (64.5853, 41.3774),
            "Vanuatu": (166.959, -15.3767),
            "Vatican City": (12.4534, 41.9029),
            "Venezuela": (-66.5897, 6.4238),
            "Vietnam": (108.2772, 14.0583),
            "Virgin Islands, U.S.": (-64.8963, 18.3358),
            "Wallis and Futuna": (-177.1561, -13.7687),
            "Western Sahara": (-13.125, 24.2155),
            "Yemen": (48.5164, 15.5527),
            "Zambia": (27.8493, -13.1339),
            "Zimbabwe": (29.1549, -19.0154)
        }

    # ---------- Default Dictionaries ----------
    def _default_method_dict(self):
        # Comprehensive list for Urban & Regional Planning
        rows = [
            # Basic econometrics
            ("regression","Regression"),("linear regression","Regression"),("logistic regression","Regression"),
            ("time series","Time Series"),("arima","ARIMA"),("sarima","SARIMA"),("var","VAR"),
            ("panel data","Panel Data"),("cross section","Cross-Section"),

            # Policy evaluation
            ("difference in differences","DiD"),("differences in differences","DiD"),("difference-in-differences","DiD"),
            ("synthetic control","Synthetic Control"),("propensity score","Propensity Score Matching"),

            # Spatial econometrics
            ("spatial regression","Spatial Regression"),("spatial lag","Spatial Lag"),("sar","Spatial Lag"),
            ("spatial error","Spatial Error"),("sem","Spatial Error"),("sdm","Spatial Durbin"),
            ("spatial econometric","Spatial Econometrics"),

            # GWR
            ("gwr","GWR"),("mgwr","MGWR"),("geographically weighted regression","GWR"),
            ("geographically weighted panel","GW Panel"),

            # IO & CGE
            ("input output","Input–Output"),("cge","Computable General Equilibrium"),

            # System dynamics & ABM
            ("system dynamics","System Dynamics"),
            ("agent based","Agent-Based Model"),("abm","Agent-Based Model"),

            # Land use change & CA
            ("cellular automata","Cellular Automata"),("ca","Cellular Automata"),
            ("land use change","Land Use Change Model"),("urban growth model","Urban Growth Model"),
            ("markov","Markov Chain"),

            # Machine learning
            ("random forest","Random Forest"),("decision tree","Decision Tree"),("classification tree","Decision Tree"),
            ("gradient boosting","Gradient Boosting"),("xgboost","Gradient Boosting"),
            ("lightgbm","Gradient Boosting"),("catboost","Gradient Boosting"),
            ("svm","SVM"),("support vector","SVM"),("knn","KNN"),

            # Neural nets & deep learning
            ("neural network","Artificial Neural Network"),("ann","Artificial Neural Network"),
            ("mlp","Artificial Neural Network"),
            ("deep learning","Deep Learning"),("cnn","Deep Learning"),("lstm","LSTM"),

            # Advanced statistics
            ("bayesian","Bayesian Model"),("monte carlo","Monte Carlo Simulation"),
            ("simulation","Simulation"),("scenario analysis","Scenario Analysis"),
            ("forecasting","Forecasting"),

            # Big data & mining
            ("big data","Big Data Analytics"),("data mining","Data Mining"),
            ("text mining","Text Mining"),("sentiment analysis","Sentiment Analysis"),

            # Clustering & dimension reduction
            ("cluster analysis","Clustering"),("kmeans","Clustering"),("dbscan","Clustering"),
            ("hierarchical clustering","Clustering"),
            ("pca","Principal Component Analysis"),("factor analysis","Factor Analysis"),
            ("multivariate analysis","Multivariate Analysis"),

            # MCDA/MCDM
            ("topsis","MCDA-TOPSIS"),("ahp","MCDA-AHP"),("anp","MCDA-ANP"),
            ("saw","MCDA-SAW"),("promethee","MCDA-PROMETHEE"),("electre","MCDA-ELECTRE"),
            ("mcda","Multi-Criteria Decision Analysis"),("mcdm","Multi-Criteria Decision Making"),
            ("multi objective","Multi-Objective Optimization"),

            # Optimization
            ("linear programming","Optimization"),("integer programming","Optimization"),
            ("goal programming","Optimization"),

            # Network & spatial interaction
            ("network analysis","Network Analysis"),("graph theory","Network Analysis"),
            ("spatial network","Spatial Network"),
            ("space syntax","Space Syntax"),
            ("accessibility model","Accessibility"),
            ("gravity model","Gravity Model"),("spatial interaction","Spatial Interaction"),
            ("location allocation","Location-Allocation"),("facility location","Facility Location"),

            # Land suitability
            ("land suitability","Land Suitability Analysis"),
            ("mce","Multi-Criteria Evaluation"),("mca","Multi-Criteria Analysis"),

            # GIS & RS
            ("gis analysis","GIS"),("gis","GIS"),
            ("geospatial","GIS"),
            ("remote sensing","Remote Sensing"),("satellite imagery","Remote Sensing"),
            ("earth observation","Remote Sensing"),
            ("ndvi","Remote Sensing"),("lcz","Local Climate Zone"),

            # Urban morphology
            ("urban morphology","Urban Morphology"),
            ("fractal","Fractal Analysis"),
            ("entropy","Entropy Analysis"),

            # Equity & inequality
            ("equity analysis","Equity/Justice Analysis"),("justice analysis","Equity/Justice Analysis"),
            ("gini","Gini Index"),("lorenz","Lorenz Curve"),("theil","Theil Index"),
            ("spatial inequality","Spatial Inequality"),

            # Social networks
            ("social network analysis","Social Network Analysis"),("sna","Social Network Analysis"),

            # Qualitative & participatory
            ("survey","Survey"),("questionnaire","Survey"),("interview","Qualitative"),
            ("focus group","Qualitative"),("case study","Case Study"),("comparative study","Comparative Study"),
            ("mixed methods","Mixed Methods"),("qualitative","Qualitative"),("quantitative","Quantitative"),
            ("participatory mapping","Participatory Mapping"),
            ("ppgis","Participatory GIS"),("pgis","Participatory GIS"),
            ("vgi","Volunteered Geographic Information"),("crowdsourcing","Crowdsourcing"),
            ("openstreetmap","OSM"),
            ("big earth data","Big Earth Data")
        ]
        return pd.DataFrame(rows, columns=["pattern","label"])

    def _default_region_dict(self):
        rows = [
            ("indonesia","Indonesia","country"),
            ("jakarta","DKI Jakarta","province"),
            ("jawa timur","Jawa Timur","province"),("jawa barat","Jawa Barat","province"),
            ("jawa tengah","Jawa Tengah","province"),("di yogyakarta","DIY","province"),
            ("sumatera utara","Sumatera Utara","province"),("sumatera barat","Sumatera Barat","province"),
            ("riau","Riau","province"),("kalimantan timur","Kalimantan Timur","province"),
            ("kalimantan selatan","Kalimantan Selatan","province"),("kalimantan barat","Kalimantan Barat","province"),
            ("sulawesi selatan","Sulawesi Selatan","province"),("bali","Bali","province"),
            ("ntt","NTT","province"),("ntb","NTB","province"),("papua","Papua","province"),
            ("malang","Malang","city"),("surabaya","Surabaya","city"),("bandung","Bandung","city"),
            ("yogyakarta","Yogyakarta","city"),("semarang","Semarang","city"),("medan","Medan","city"),
            ("makassar","Makassar","city"),("palembang","Palembang","city"),("bengkulu","Bengkulu","city"),
            ("samarinda","Samarinda","city"),("balikpapan","Balikpapan","city"),("bogor","Bogor","city"),("batu","Batu","city"),
            ("east java","Jawa Timur","province"),("west java","Jawa Barat","province"),("central java","Jawa Tengah","province")
        ]
        return pd.DataFrame(rows, columns=["pattern","label","level"])

    def _load_method_dict(self, csv_path):
        if csv_path and os.path.isfile(csv_path):
            df = pd.read_csv(csv_path)
            df = df.rename(columns={c: c.strip().lower() for c in df.columns})
            if {"pattern","label"}.issubset(df.columns):
                df["pattern"] = df["pattern"].astype(str).str.lower().str.strip()
                df["label"] = df["label"].astype(str).str.strip()
                return df[["pattern","label"]]
        return self._default_method_dict()

    def _load_region_dict(self, csv_path):
        if csv_path and os.path.isfile(csv_path):
            df = pd.read_csv(csv_path)
            df = df.rename(columns={c: c.strip().lower() for c in df.columns})
            if {"pattern","label","level"}.issubset(df.columns):
                df["pattern"] = df["pattern"].astype(str).str.lower().str.strip()
                df["label"] = df["label"].astype(str).str.strip()
                df["level"] = df["level"].astype(str).str.strip()
                return df[["pattern","label","level"]]
        return self._default_region_dict()

    def _detect_methods(self, text, method_df):
        found, t = set(), text.lower()
        for _, row in method_df.iterrows():
            if row["pattern"] and row["pattern"] in t: found.add(row["label"])
        return sorted(found)

    def _detect_regions(self, text, region_df):
        found, t = set(), text.lower()
        for _, row in region_df.iterrows():
            if row["pattern"] and row["pattern"] in t: found.add((row["label"], row["level"]))
        return sorted(list(set([lab for lab, lvl in found])))

    # ---------- Main ----------
    def processAlgorithm(self, parameters, context, feedback):
        in_folder = self.parameterAsFile(parameters, self.PARAM_INPUT_FOLDER, context)
        pattern = self.parameterAsString(parameters, self.PARAM_FILE_PATTERN, context)
        year_min = int(self.parameterAsInt(parameters, self.PARAM_YEAR_MIN, context))
        year_max = int(self.parameterAsInt(parameters, self.PARAM_YEAR_MAX, context))
        min_edge = int(self.parameterAsInt(parameters, self.PARAM_MIN_EDGE_WEIGHT, context))
        do_plots = bool(self.parameterAsBool(parameters, self.PARAM_DO_PLOTS, context))
        out_folder = self.parameterAsFileOutput(parameters, self.PARAM_OUTPUT_FOLDER, context)

        method_csv = self.parameterAsFile(parameters, self.PARAM_METHOD_DICT_CSV, context)
        region_csv = self.parameterAsFile(parameters, self.PARAM_REGION_DICT_CSV, context)
        min_count_gap = int(self.parameterAsInt(parameters, self.PARAM_MIN_COUNT_GAP, context))
        w_urg = float(self.parameterAsDouble(parameters, self.PARAM_URGENCY, context))
        w_emp = float(self.parameterAsDouble(parameters, self.PARAM_EMPTINESS, context))
        w_fea = float(self.parameterAsDouble(parameters, self.PARAM_FEASIBILITY, context))

        kw_top_n = int(self.parameterAsInt(parameters, self.PARAM_KW_TOP_N, context))
        kw_min_node_freq = int(self.parameterAsInt(parameters, self.PARAM_KW_MIN_NODE_FREQ, context))
        kw_max_coocc = int(self.parameterAsInt(parameters, self.PARAM_KW_MAX_COOCC, context))
        kw_score_mode = (self.parameterAsString(parameters, self.PARAM_KW_SCORE, context) or "jaccard").strip().lower()

        cluster_method = (self.parameterAsString(parameters, self.PARAM_CLUSTER_METHOD, context) or "auto").strip().lower()

        region_filter_raw = (self.parameterAsString(parameters, self.PARAM_REGION_FILTER, context) or "").strip()
        heatmap_topn = int(self.parameterAsInt(parameters, self.PARAM_HEATMAP_TOPN, context))
        region_filter = [r.strip() for r in region_filter_raw.split(",") if r.strip()]

        # --- safe init to avoid UnboundLocalError ---
        df_gap_mat = pd.DataFrame()
        path_kw_gap_mat = ""
        path_kw_gap_heatmap = None


        if not os.path.isdir(in_folder):
            raise QgsProcessingException(self.tr("Invalid input folder"))
        os.makedirs(out_folder, exist_ok=True)

        # Read data
        df_ris = self.parse_ris_folder(in_folder, pattern)
        df_bib = self.parse_bibtex_folder(in_folder, pattern)
        if df_ris.empty and df_bib.empty:
            raise QgsProcessingException(self.tr("No RIS or BibTeX data could be read"))
        df_raw = pd.concat([df_ris, df_bib], ignore_index=True, sort=False)

        # Safe normalization
        n = len(df_raw)
        def safe_series(colname): return df_raw[colname] if colname in df_raw.columns else pd.Series([None]*n)

        s_t1, s_ti = safe_series("T1"), safe_series("TI")
        df_raw["title"] = s_t1.where(~s_t1.isna(), s_ti)

        s_jo, s_t2 = safe_series("JO"), safe_series("T2")
        df_raw["journal"] = s_jo.where(~s_jo.isna(), s_t2)

        df_raw["year"] = self._extract_year_series(df_raw)

        if "AU" in df_raw.columns: df_raw["authors"] = safe_series("AU").apply(self._as_list)
        else: df_raw["authors"] = pd.Series([[]]*n)
        if "KW" in df_raw.columns: df_raw["keywords"] = safe_series("KW").apply(self._as_list)
        else: df_raw["keywords"] = pd.Series([[]]*n)

        df_raw["doi"] = safe_series("DO")
        df_raw["url"] = safe_series("UR")
        df_raw["abstract"] = safe_series("AB")
        df_raw["authors"] = df_raw["authors"].apply(lambda v: v if isinstance(v, list) else self._as_list(v))
        df_raw["keywords"] = df_raw["keywords"].apply(lambda v: v if isinstance(v, list) else self._as_list(v))

        # Deduplication
        dk = df_raw["doi"].astype(str).str.lower().fillna("")
        dk = dk + "||" + df_raw["title"].apply(lambda x: self._norm_text(x) or "")
        dk = dk + "||" + pd.Series(df_raw["year"]).fillna(-1).astype(int).astype(str)
        df_raw["dupe_key"] = dk
        df = df_raw.drop_duplicates(subset=["dupe_key"]).copy()

        # Basic bibliometrics
        total_records = len(df)
        years = pd.Series(df["year"]).dropna().astype(int)
        pub_per_year_df = years.value_counts().sort_index().rename_axis("Year").reset_index(name="Publications")
        top_journals = df["journal"].dropna().value_counts().head(20).rename_axis("Journal").reset_index(name="Count")
        all_authors = list(itertools.chain.from_iterable(df["authors"].tolist()))
        all_authors = [re.sub(r"\s+", " ", str(a).strip()) for a in all_authors if a]
        top_authors = pd.Series(all_authors).value_counts().head(20).rename_axis("Author").reset_index(name="Count")
        df["kw_norm"] = df["keywords"].apply(self._clean_kw_list)
        all_kws = list(itertools.chain.from_iterable(df["kw_norm"].tolist()))
        top_keywords = pd.Series(all_kws).value_counts().head(30).rename_axis("Keyword").reset_index(name="Frequency")

        # Optional: deteksi negara dari affiliation (tag C1 di RIS) pakai kamus negara
                # Optional: deteksi negara dari affiliation (C1 / AD / C2 di RIS) pakai kamus negara
        country_coords = self._country_coord_dict()
        country_summary = None
        df["affiliation_raw"] = None
        df["country"] = None

        # kita cari kolom afiliasi yang ada di df_raw
        aff_cols = [c for c in ["C1", "AD", "C2"] if c in df_raw.columns]

        if country_coords and aff_cols:
            try:
                # gabungkan semua kolom afiliasi menjadi satu string per dupe_key
                def _join_aff(row):
                    parts = []
                    for c in aff_cols:
                        v = row.get(c)
                        if isinstance(v, list):
                            parts.extend([str(x) for x in v if x])
                        elif v is not None and not (isinstance(v, float) and math.isnan(v)):
                            parts.append(str(v))
                    return " ; ".join(parts) if parts else None

                df_raw["AFF_ALL"] = df_raw.apply(_join_aff, axis=1)

                # Petakan dupe_key -> AFF_ALL, lalu tempel ke df yang sudah dedup
                aff_map = (
                    df_raw[["dupe_key", "AFF_ALL"]]
                    .drop_duplicates(subset=["dupe_key"])
                    .set_index("dupe_key")["AFF_ALL"]
                )

                def _detect_country(text):
                    if not isinstance(text, str):
                        return None
                    t = text.lower()
                    for cname in country_coords.keys():
                        if cname.lower() in t:
                            return cname
                    return None

                df["affiliation_raw"] = df["dupe_key"].map(aff_map)
                df["country"] = df["affiliation_raw"].apply(_detect_country)

                if "country" in df.columns:
                    country_summary = (
                        df["country"]
                        .dropna()
                        .value_counts()
                        .rename_axis("Country")
                        .reset_index(name="Count")
                    )
            except Exception:
                country_summary = None

        # Networks
        kw_nodes, kw_edges = self._cooccurrence(df["kw_norm"].tolist(), min_weight=min_edge)
        auth_lists = df["authors"].apply(lambda lst: [re.sub(r"\s+", " ", str(a).strip()) for a in lst if a])
        auth_nodes, auth_edges = self._cooccurrence(auth_lists.tolist(), min_weight=min_edge)

        # --- Keyword & Author Network Visualizations (optional if networkx available) ---
        keyword_net_png = os.path.join(out_folder, "keyword_network.png")
        author_net_png = os.path.join(out_folder, "author_network.png")
        keyword_centrality_csv = os.path.join(out_folder, "keyword_network_centrality.csv")
        author_centrality_csv = os.path.join(out_folder, "author_network_centrality.csv")

        if _HAS_NX:
            try:
                df_cent_kw = self._plot_network(
                    kw_nodes, kw_edges, keyword_net_png,
                    title="Keyword Network",
                    node_id_col="id", node_freq_col="freq",
                    src_col="source", tgt_col="target", w_col="weight",
                    max_nodes=200, max_edges_fallback=500,
                    feedback=feedback
                )
                if isinstance(df_cent_kw, pd.DataFrame) and not df_cent_kw.empty:
                    df_cent_kw.to_csv(keyword_centrality_csv, index=False)
            except Exception as e:
                feedback.pushInfo(f"[WARN] Failed to create keyword network plot: {e}")

            try:
                df_cent_auth = self._plot_network(
                    auth_nodes, auth_edges, author_net_png,
                    title="Author Co-authorship Network",
                    node_id_col="id", node_freq_col="freq",
                    src_col="source", tgt_col="target", w_col="weight",
                    max_nodes=200, max_edges_fallback=500,
                    feedback=feedback
                )
                if isinstance(df_cent_auth, pd.DataFrame) and not df_cent_auth.empty:
                    df_cent_auth.to_csv(author_centrality_csv, index=False)
            except Exception as e:
                feedback.pushInfo(f"[WARN] Failed to create author network plot: {e}")


        # ===== Keyword Connection Gap Finder =====
        kw_freq = {row.id: int(row.freq) for row in kw_nodes.itertuples(index=False)} if not kw_nodes.empty else {}
        edge_w = {}
        if not kw_edges.empty:
            for r in kw_edges.itertuples(index=False):
                a, b, w = str(r.source), str(r.target), int(r.weight)
                if a > b: a, b = b, a
                edge_w[(a, b)] = w

        cand_nodes = []
        if not kw_nodes.empty:
            for row in kw_nodes.itertuples(index=False):
                if int(row.freq) >= kw_min_node_freq:
                    cand_nodes.append(row.id)
                if len(cand_nodes) >= kw_top_n:
                    break

        total_docs = max(1, len(df))
        gap_pairs = []
        for i in range(len(cand_nodes)):
            for j in range(i+1, len(cand_nodes)):
                a, b = cand_nodes[i], cand_nodes[j]
                aa, bb = (a, b) if a < b else (b, a)
                co = int(edge_w.get((aa, bb), 0))
                if co <= kw_max_coocc:
                    fa, fb = int(kw_freq.get(a, 0)), int(kw_freq.get(b, 0))
                    if kw_score_mode == "pmi":
                        pa = self._safe_div(fa, total_docs)
                        pb = self._safe_div(fb, total_docs)
                        pab = self._safe_div(co, total_docs)
                        gap_score = (fa + fb) if pab == 0 else -math.log(self._safe_div(pab, (pa * pb))) if pa>0 and pb>0 else (fa+fb)
                    else:
                        gap_score = (fa + fb) - (2 * co)
                    gap_pairs.append({"kw_a": a, "freq_a": fa, "kw_b": b, "freq_b": fb, "coocc": co, "gap_score": float(gap_score)})

        keyword_gap_df = pd.DataFrame(gap_pairs).sort_values(
            ["gap_score","freq_a","freq_b"], ascending=[False, False, False]
        ) if len(gap_pairs) > 0 else pd.DataFrame(columns=["kw_a","freq_a","kw_b","freq_b","coocc","gap_score"])

        path_kw_gap = os.path.join(out_folder, "keyword_gap_pairs.csv")
        keyword_gap_df.to_csv(path_kw_gap, index=False)

        # Co-occurrence matrix for candidate keywords + HEATMAP
        idx = {k:i for i,k in enumerate(cand_nodes)}
        mat = np.zeros((len(cand_nodes), len(cand_nodes)), dtype=int)
        for (a,b),w in edge_w.items():
            if a in idx and b in idx:
                i, j = idx[a], idx[b]
                mat[i,j] = w; mat[j,i] = w
        path_kw_mat = os.path.join(out_folder, "keyword_cooccurrence_matrix.csv")
        path_kw_mat_png = os.path.join(out_folder, "keyword_cooccurrence_heatmap.png")

        df_mat = None  # <<< initialize to prevent UnboundLocalError
        if len(cand_nodes) > 0:
            df_mat = pd.DataFrame(mat, index=cand_nodes, columns=cand_nodes)
            if df_mat.shape[0] > heatmap_topn:
                df_mat = df_mat.iloc[:heatmap_topn, :heatmap_topn]
            df_mat.to_csv(path_kw_mat, index=True)
            if do_plots and not df_mat.empty:
                self._plot_heatmap(df_mat, "Keyword Co-occurrence (Top nodes)", path_kw_mat_png, vmax=None, highlight_topn=0)
                if feedback:
                    feedback.pushInfo(f"[Plot] Co-occurrence heatmap: {path_kw_mat_png} | shape={df_mat.shape}")

        else:
            # Pastikan df_mat bukan None agar aman saat di-log
            df_mat = pd.DataFrame()
            df_mat.to_csv(path_kw_mat, index=True)
            path_kw_mat_png = ""  # just empty (not created)
            if feedback:
                feedback.pushInfo("[Plot] Co-occurrence heatmap: skipped (no candidate nodes)")

        # === Keyword GAP MATRIX & HEATMAP (values = gap_score for gap pairs) ===
        path_kw_gap_mat = os.path.join(out_folder, "keyword_gap_matrix.csv")
        path_kw_gap_heatmap = os.path.join(out_folder, "keyword_gap_heatmap.png")

        if not keyword_gap_df.empty:
            # pick nodes appearing in gap pairs, sort by total frequency
            nodes_in_gaps = list(dict.fromkeys(
                list(keyword_gap_df["kw_a"]) + list(keyword_gap_df["kw_b"])
            ))
            nodes_in_gaps = sorted(nodes_in_gaps, key=lambda k: kw_freq.get(k,0), reverse=True)
            if len(nodes_in_gaps) > heatmap_topn:
                nodes_in_gaps = nodes_in_gaps[:heatmap_topn]

            # build symmetric gap_score matrix
            gap_index = {k:i for i,k in enumerate(nodes_in_gaps)}
            G = np.zeros((len(nodes_in_gaps), len(nodes_in_gaps)), dtype=float)
            for r in keyword_gap_df.itertuples(index=False):
                a,b,score = r.kw_a, r.kw_b, float(r.gap_score)
                if a in gap_index and b in gap_index:
                    i,j = gap_index[a], gap_index[b]
                    G[i,j] = score; G[j,i] = score

            df_gap_mat = pd.DataFrame(G, index=nodes_in_gaps, columns=nodes_in_gaps)
            df_gap_mat.to_csv(path_kw_gap_mat)

            # highlight top 10 gaps with red overlay
            if do_plots and df_gap_mat.values.size > 0:
                self._plot_heatmap(
                    df_gap_mat,
                    "Keyword GAP Pairs (gap_score)",
                    path_kw_gap_heatmap,
                    highlight_topn=10
                )
        else:
            # no gaps → still create empty file & avoid UnboundLocalError
            df_gap_mat = pd.DataFrame()
            df_gap_mat.to_csv(path_kw_gap_mat)
            path_kw_gap_heatmap = None

        # Save bibliometric tables/Excel
        path_pub_year = os.path.join(out_folder, "publications_per_year.csv")
        path_top_journals = os.path.join(out_folder, "top_journals.csv")
        path_top_authors = os.path.join(out_folder, "top_authors.csv")
        path_top_keywords = os.path.join(out_folder, "top_keywords.csv")
        path_kw_nodes = os.path.join(out_folder, "keyword_nodes.csv")
        path_kw_edges = os.path.join(out_folder, "keyword_edges.csv")
        path_auth_nodes = os.path.join(out_folder, "author_nodes.csv")
        path_auth_edges = os.path.join(out_folder, "author_edges.csv")
        path_excel = os.path.join(out_folder, "bibliometrics_summary.xlsx")

        pub_per_year_df.to_csv(path_pub_year, index=False)
        top_journals.to_csv(path_top_journals, index=False)
        top_authors.to_csv(path_top_authors, index=False)
        top_keywords.to_csv(path_top_keywords, index=False)
        kw_nodes.to_csv(path_kw_nodes, index=False)
        kw_edges.to_csv(path_kw_edges, index=False)
        auth_nodes.to_csv(path_auth_nodes, index=False)
        auth_edges.to_csv(path_auth_edges, index=False)

        writer = self._get_excel_writer(path_excel)
        if writer is None:
            df[["title","journal","year","authors","keywords","doi","url","country"]].to_csv(os.path.join(out_folder,"Articles_List.csv"), index=False)
            pub_per_year_df.to_csv(os.path.join(out_folder,"Publications_per_Year.csv"), index=False)
            top_journals.to_csv(os.path.join(out_folder,"Top_Journals.csv"), index=False)
            top_authors.to_csv(os.path.join(out_folder,"Top_Authors.csv"), index=False)
            top_keywords.to_csv(os.path.join(out_folder, "top_keywords.csv"), index=False)

            # If the country analysis succeeded, save the country summary
            if country_summary is not None:
                country_summary.to_csv(
                    os.path.join(out_folder, "country_summary.csv"),
                    index=False
                )
            kw_nodes.to_csv(os.path.join(out_folder,"KW_Nodes.csv"), index=False)
            kw_edges.to_csv(os.path.join(out_folder,"KW_Edges.csv"), index=False)
            auth_nodes.to_csv(os.path.join(out_folder,"Author_Nodes.csv"), index=False)
            auth_edges.to_csv(os.path.join(out_folder,"Author_Edges.csv"), index=False)
            with open(os.path.join(out_folder,"README_NO_EXCEL.txt"),"w",encoding="utf-8") as f:
                f.write("xlsxwriter/openpyxl not available, XLSX summary not created. All sheets exported as CSV.\n")
        else:
            with writer:
                df[["title","journal","year","authors","keywords","doi","url","country"]].to_excel(writer, sheet_name="Articles List", index=False)
                pub_per_year_df.to_excel(writer, sheet_name="Publications per Year", index=False)
                top_journals.to_excel(writer, sheet_name="Top Journals", index=False)
                top_authors.to_excel(writer, sheet_name="Top Authors", index=False)
                top_keywords.to_excel(writer, sheet_name="Top Keywords", index=False)
                kw_nodes.to_excel(writer, sheet_name="KW Nodes", index=False)
                kw_edges.to_excel(writer, sheet_name="KW Edges", index=False)
                auth_nodes.to_excel(writer, sheet_name="Author Nodes", index=False)
                auth_edges.to_excel(writer, sheet_name="Author Edges", index=False)

        # Charts
        chart_pub_year = os.path.join(out_folder, "chart_publications_per_year.png")
        chart_top_journals = os.path.join(out_folder, "chart_top_journals.png")
        chart_top_authors = os.path.join(out_folder, "chart_top_authors.png")
        chart_top_keywords = os.path.join(out_folder, "chart_top_keywords.png")
        chart_kw_trends = os.path.join(out_folder, "chart_keyword_trends.png")
        chart_country_top10 = os.path.join(out_folder, "chart_top10_countries.png")
        trend_csv = os.path.join(out_folder, "keyword_trends.csv")

        if do_plots and not pub_per_year_df.empty:
            self._plot_line(pub_per_year_df["Year"], pub_per_year_df["Publications"], "Publications per Year", chart_pub_year, "Year", "Count")
        if do_plots and not top_journals.empty:
            self._plot_barh(top_journals.head(15), "Count", "Journal", "Top 15 Journals", chart_top_journals)
        if do_plots and not top_authors.empty:
            self._plot_barh(top_authors.head(15), "Count", "Author", "Top 15 Authors", chart_top_authors)
        if do_plots and not top_keywords.empty:
            self._plot_barh(top_keywords.head(15), "Frequency", "Keyword", "Top 15 Keywords", chart_top_keywords)
        # Top 10 countries by affiliation (if available and country detection succeeded)
        if do_plots and (country_summary is not None) and (not country_summary.empty):
            try:
                top_countries_10 = country_summary.head(10)
                self._plot_barh(
                    top_countries_10,
                    "Count",
                    "Country",
                    "Top 10 Countries (Affiliation)",
                    chart_country_top10
                )
            except Exception as e:
                if feedback:
                    feedback.pushInfo(f"[WARN] Failed to create Top 10 Countries chart: {e}")

        # Keyword trends
        top10_kws = top_keywords["Keyword"].head(10).tolist()
        trend_rows = []
        if len(top10_kws) > 0:
            for k in top10_kws:
                for y, group in df.groupby("year"):
                    if pd.isna(y): continue
                    c = sum(1 for lst in group["kw_norm"] if k in lst)
                    trend_rows.append({"Keyword": k, "Year": int(y), "Count": c})
        trend_df = pd.DataFrame(trend_rows).sort_values(["Keyword","Year"])
        trend_df.to_csv(trend_csv, index=False)
        if do_plots and not trend_df.empty:
            plt.figure(figsize=(8, 4.5))
            for k in top10_kws[:5]:
                d = trend_df[trend_df["Keyword"] == k]
                if not d.empty: plt.plot(d["Year"], d["Count"], marker="o", label=k)
            plt.title("Top 5 Keyword Trends"); plt.xlabel("Year"); plt.ylabel("Article Count")
            plt.legend(); plt.grid(True, axis="y", linewidth=0.4); plt.tight_layout()
            plt.savefig(chart_kw_trends, dpi=160); plt.close()

        # Initial SLR + clustering
        def has_abstract(x):
            """Return True when *x* is a string longer than 50 characters after stripping."""
            if not isinstance(x, str):
                return False
            return len(x.strip()) > 50
        df_slr = df.copy()
        df_slr = df_slr[df_slr["year"].apply(lambda y: isinstance(y, (int, np.integer)) and year_min <= y <= year_max)]
        df_slr = df_slr[df_slr["abstract"].apply(has_abstract)]

        # ==== HARDEN kw_norm (required) ====
        if "kw_norm" not in df_slr.columns:
            if "keywords" in df_slr.columns:
                df_slr["kw_norm"] = df_slr["keywords"].apply(self._clean_kw_list)
            else:
                df_slr["kw_norm"] = [[] for _ in range(len(df_slr))]
        else:
            df_slr["kw_norm"] = df_slr["kw_norm"].apply(
                lambda v: v if isinstance(v, list) else self._clean_kw_list(v)
            )

        # ==== SAFE-GET kw_norm for text construction (required) ====
        def _get_kw_norm_safe(row):
            """Return the row's ``kw_norm`` value as a list.

            A missing key yields ``[]``; a value that is already a list is
            returned unchanged; anything else is passed through
            ``self._clean_kw_list``.
            """
            kw_value = row.get("kw_norm", [])
            if isinstance(kw_value, list):
                return kw_value
            return self._clean_kw_list(kw_value)

        df_slr["__text"] = df_slr.apply(
            lambda r: " ".join([
                str(r.get("title") or ""),
                " ".join(_get_kw_norm_safe(r)),
                str(r.get("abstract") or "")
            ]),
            axis=1
        )

        cluster_summary_path = os.path.join(out_folder, "slr_cluster_summary.csv")
        slr_candidates_path = os.path.join(out_folder, "slr_candidates_autoscreen.csv")
        prisma_path = os.path.join(out_folder, "slr_prisma_summary.json")

        cluster_quality_path = os.path.join(out_folder, "slr_cluster_quality.json")
        slr_preview_path = os.path.join(out_folder, "slr_clusters_preview.csv")

        try:
            from sklearn.feature_extraction.text import TfidfVectorizer
            from sklearn.cluster import KMeans, SpectralClustering
            from sklearn.metrics import silhouette_score

            texts = df_slr["__text"].fillna("").tolist()
            if len(texts) >= 10:
                vect = TfidfVectorizer(max_features=8000, ngram_range=(1,2), min_df=2, stop_words="english")
                X = vect.fit_transform(texts)

                n_docs = X.shape[0]
                n_clusters = 3 if n_docs < 64 else min(10, max(3, int(np.sqrt(n_docs)//8 + 3)))

                used_method, labels, km = "kmeans", None, None
                if cluster_method == "spectral" or (cluster_method == "auto" and n_docs >= 60):
                    try:
                        sc = SpectralClustering(
                            n_clusters=n_clusters,
                            affinity="nearest_neighbors",
                            n_neighbors=min(15, max(5, n_docs//20)),
                            assign_labels="kmeans",
                            random_state=42,
                        )
                        labels = sc.fit_predict(X)
                        used_method = "spectral"
                    except Exception:
                        km = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
                        labels = km.fit_predict(X)
                        used_method = "kmeans"
                else:
                    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
                    labels = km.fit_predict(X)
                    used_method = "kmeans"

                terms = np.array(vect.get_feature_names_out())
                try:
                    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
                    top_terms = {i: terms[order_centroids[i, :15]].tolist() for i in range(n_clusters)}
                except Exception:
                    top_terms = {}
                    for c in range(n_clusters):
                        idxs = np.where(labels == c)[0]
                        if len(idxs) == 0:
                            top_terms[c] = []
                            continue
                        mean_vec = X[idxs].mean(axis=0).A1
                        top_ids = np.argsort(mean_vec)[::-1][:15]
                        top_terms[c] = terms[top_ids].tolist()

                df_slr["cluster"] = labels
                df_slr["cluster_top_terms"] = df_slr["cluster"].apply(lambda c: ", ".join(top_terms.get(c, [])[:8]))
                df_slr["cluster_label"] = df_slr["cluster"].apply(lambda c: " / ".join(top_terms.get(c, [])[:3]))

                try:
                    sil = float(silhouette_score(X, labels, metric="cosine")) if n_clusters > 1 else None
                except Exception:
                    sil = None

                quality = {"method": used_method, "n_clusters": int(n_clusters), "silhouette_cosine": sil}
                with io.open(cluster_quality_path, "w", encoding="utf-8") as f:
                    json.dump(quality, f, ensure_ascii=False, indent=2)

                preview_rows = []
                for c in range(n_clusters):
                    sub = df_slr[df_slr["cluster"] == c].head(8)
                    for r in sub.itertuples():
                        preview_rows.append({
                            "cluster": c,
                            "cluster_label": r.cluster_label,
                            "title": r.title,
                            "year": r.year,
                            "journal": r.journal
                        })
                pd.DataFrame(preview_rows).to_csv(slr_preview_path, index=False)

            else:
                df_slr["cluster"] = 0
                df_slr["cluster_top_terms"] = ""
                df_slr["cluster_label"] = "theme"
                with io.open(cluster_quality_path, "w", encoding="utf-8") as f:
                    json.dump({"method":"none","n_clusters":1,"silhouette_cosine":None}, f, ensure_ascii=False, indent=2)

        except Exception:
            df_slr["cluster"] = 0
            df_slr["cluster_top_terms"] = ""
            df_slr["cluster_label"] = "theme"
            with io.open(cluster_quality_path, "w", encoding="utf-8") as f:
                json.dump({"method":"unavailable","n_clusters":1,"silhouette_cosine":None}, f, ensure_ascii=False, indent=2)

        # Ensure the 'title' column exists so the later export is safe (even if it is empty)
        if "title" not in df_slr.columns:
            df_slr["title"] = None

        # Use .size() so the count does not depend on any particular column
        cluster_summary = (
            df_slr.groupby(["cluster", "cluster_label"], dropna=False)
                  .size()
                  .reset_index(name="Article Count")
                  .sort_values("Article Count", ascending=False)
        )

        cluster_summary.to_csv(cluster_summary_path, index=False)

        cols = ["title","journal","year","authors","keywords","doi","url","abstract",
                "cluster","cluster_label","cluster_top_terms"]
        # Make sure these columns exist first
        for c in cols:
            if c not in df_slr.columns:
                df_slr[c] = None
        df_slr[cols].to_csv(slr_candidates_path, index=False)

        prisma = {
            "records_identified": int(len(df)),
            "duplicates_removed": int(0),
            "records_screened_title_abstract": int(len(df)),
            "records_excluded_no_abstract_or_out_of_year": int(len(df) - len(df_slr)),
            "studies_included_for_thematic_mapping": int(len(df_slr))
        }
        with io.open(prisma_path, "w", encoding="utf-8") as f:
            json.dump(prisma, f, ensure_ascii=False, indent=2)

        # === SLR SUMMARY to XLSX ===
        slr_xlsx = os.path.join(out_folder, "slr_summary.xlsx")
        slr_writer = self._get_excel_writer(slr_xlsx)
        if slr_writer is None:
            cluster_summary.to_csv(os.path.join(out_folder, "slr_cluster_summary.csv"), index=False)
            df_slr[cols].to_csv(os.path.join(out_folder, "slr_candidates_autoscreen.csv"), index=False)
            # also write cluster quality as a tiny CSV for convenience
            try:
                q = json.load(io.open(cluster_quality_path, "r", encoding="utf-8"))
                pd.DataFrame([q]).to_csv(os.path.join(out_folder, "slr_cluster_quality.csv"), index=False)
            except Exception:
                pass
            with open(os.path.join(out_folder,"README_NO_EXCEL.txt"),"a",encoding="utf-8") as f:
                f.write("slr_summary.xlsx was not created (Excel engine unavailable). Use the provided CSVs.\n")
        else:
            with slr_writer:
                cluster_summary.to_excel(slr_writer, sheet_name="Cluster Summary", index=False)
                # limit candidates to 1000 rows to keep file light
                df_slr[cols].head(1000).to_excel(slr_writer, sheet_name="Candidates (Top1000)", index=False)
                # cluster quality → small sheet
                try:
                    q = json.load(io.open(cluster_quality_path, "r", encoding="utf-8"))
                    pd.DataFrame([q]).to_excel(slr_writer, sheet_name="Quality", index=False)
                except Exception:
                    pd.DataFrame([{"method":"-", "n_clusters": "-", "silhouette_cosine": "-"}]).to_excel(slr_writer, sheet_name="Quality", index=False)

        # GAP FINDER (theme–method, theme–region, method–region)
        method_dict = self._load_method_dict(method_csv)
        region_dict = self._load_region_dict(region_csv)

        def pick_theme(row):
            """Choose a theme label for one row.

            Preference order: a non-blank ``cluster_label``; otherwise the first
            comma-separated entry of a non-blank ``cluster_top_terms``; otherwise
            the first three ``kw_norm`` keywords joined by ", "; otherwise the
            literal "theme".
            """
            label = row.get("cluster_label")
            if isinstance(label, str) and label.strip():
                return label

            top_terms = row.get("cluster_top_terms")
            if isinstance(top_terms, str) and top_terms.strip():
                return top_terms.split(",")[0].strip()

            keywords = row.get("kw_norm", [])
            if not isinstance(keywords, list):
                # Non-list values are normalised through the class helper.
                keywords = self._clean_kw_list(keywords)
            if not keywords:
                return "theme"
            return ", ".join(keywords[:3])
        df_slr["theme"] = df_slr.apply(pick_theme, axis=1)

        df_slr["__fulltext"] = df_slr.apply(
            lambda r: " ".join([
                str(r.get("title") or ""),
                str(r.get("abstract") or ""),
                " ".join(r.get("kw_norm", []) if isinstance(r.get("kw_norm", []), list) else self._clean_kw_list(r.get("kw_norm", [])))
            ]).lower(),
            axis=1
        )

        df_slr["methods_found"] = df_slr["__fulltext"].apply(lambda t: self._detect_methods(t, method_dict))
        df_slr["regions_found"] = df_slr["__fulltext"].apply(lambda t: self._detect_regions(t, region_dict))

        # --- Apply region filter (if any)
        if len(region_filter) > 0:
            def has_any_region(lst):
                """Return True if any label of ``region_filter`` occurs in *lst* (a falsy *lst* counts as empty)."""
                found = set(lst) if lst else set()
                for wanted in region_filter:
                    if wanted in found:
                        return True
                return False
            before_n = len(df_slr)
            df_slr = df_slr[df_slr["regions_found"].apply(has_any_region)].copy()
            feedback.pushInfo(f"[Region Filter] Applied: {region_filter}. {before_n} → {len(df_slr)} articles passed the filter.")
            if df_slr.empty:
                feedback.pushInfo("[Region Filter] Empty result after region filter.")

        def explode_list(df_in, col_list):
            """Return a copy of *df_in* with the list-like entries of *col_list* expanded to one row each."""
            return df_in.copy().explode(col_list)

        df_tm = explode_list(df_slr[["theme","methods_found"]], "methods_found").dropna(subset=["methods_found"])
        pivot_tm = df_tm.pivot_table(index="theme", columns="methods_found", values="methods_found", aggfunc="count", fill_value=0) if not df_tm.empty else pd.DataFrame()

        df_tr = explode_list(df_slr[["theme","regions_found"]], "regions_found").dropna(subset=["regions_found"])
        pivot_tr = df_tr.pivot_table(index="theme", columns="regions_found", values="regions_found", aggfunc="count", fill_value=0) if not df_tr.empty else pd.DataFrame()

        df_mr = explode_list(df_slr[["methods_found","regions_found"]], "methods_found")
        df_mr = explode_list(df_mr, "regions_found")
        df_mr = df_mr.dropna(subset=["methods_found","regions_found"])
        pivot_mr = df_mr.pivot_table(index="methods_found", columns="regions_found", values="regions_found", aggfunc="count", fill_value=0) if not df_mr.empty else pd.DataFrame()

        path_pivot_tm = os.path.join(out_folder, "pivot_theme_method.csv")
        path_pivot_tr = os.path.join(out_folder, "pivot_theme_region.csv")
        path_pivot_mr = os.path.join(out_folder, "pivot_method_region.csv")
        (pivot_tm if not pivot_tm.empty else pd.DataFrame()).to_csv(path_pivot_tm)
        (pivot_tr if not pivot_tr.empty else pd.DataFrame()).to_csv(path_pivot_tr)
        (pivot_mr if not pivot_mr.empty else pd.DataFrame()).to_csv(path_pivot_mr)

        # --- Heatmaps for pivots (limit size for readability)
        pivot_tm_png = os.path.join(out_folder, "pivot_theme_method_heatmap.png")
        pivot_tr_png = os.path.join(out_folder, "pivot_theme_region_heatmap.png")
        pivot_mr_png = os.path.join(out_folder, "pivot_method_region_heatmap.png")

        if do_plots and not pivot_tm.empty:
            df_vis = pivot_tm.copy()
            if df_vis.shape[0] > heatmap_topn:
                top_rows = df_vis.sum(axis=1).sort_values(ascending=False).head(heatmap_topn).index
                df_vis = df_vis.loc[top_rows]
            if df_vis.shape[1] > heatmap_topn:
                top_cols = df_vis.sum(axis=0).sort_values(ascending=False).head(heatmap_topn).index
                df_vis = df_vis.loc[:, top_cols]
            self._plot_heatmap(df_vis, "Theme × Method", pivot_tm_png)

        if do_plots and not pivot_tr.empty:
            df_vis = pivot_tr.copy()
            if df_vis.shape[0] > heatmap_topn:
                top_rows = df_vis.sum(axis=1).sort_values(ascending=False).head(heatmap_topn).index
                df_vis = df_vis.loc[top_rows]
            if df_vis.shape[1] > heatmap_topn:
                top_cols = df_vis.sum(axis=0).sort_values(ascending=False).head(heatmap_topn).index
                df_vis = df_vis.loc[:, top_cols]
            self._plot_heatmap(df_vis, "Theme × Region", pivot_tr_png)

        if do_plots and not pivot_mr.empty:
            df_vis = pivot_mr.copy()
            if df_vis.shape[0] > heatmap_topn:
                top_rows = df_vis.sum(axis=1).sort_values(ascending=False).head(heatmap_topn).index
                df_vis = df_vis.loc[top_rows]
            if df_vis.shape[1] > heatmap_topn:
                top_cols = df_vis.sum(axis=0).sort_values(ascending=False).head(heatmap_topn).index
                df_vis = df_vis.loc[:, top_cols]
            self._plot_heatmap(df_vis, "Method × Region", pivot_mr_png)

        # Safe gap identification
        gaps = []
        def collect_gaps_from_pivot(pivot_df, tag):
            """Append one entry to ``gaps`` per pivot cell whose count is <= ``min_count_gap``.

            *tag* selects which of the "tema" / "metode" / "wilayah" fields receive
            the row label and the column label. Cells whose value cannot be cast
            to int are skipped. A None or empty pivot is a no-op.
            """
            if pivot_df is None or pivot_df.empty:
                return

            # Field that takes the row label and field that takes the column label;
            # any tag other than the two theme matrices is treated as method x region.
            if tag == "tema_metode":
                row_key, col_key = "tema", "metode"
            elif tag == "tema_wilayah":
                row_key, col_key = "tema", "wilayah"
            else:
                row_key, col_key = "metode", "wilayah"

            for row_label in list(pivot_df.index):
                for col_label in list(pivot_df.columns):
                    try:
                        count = int(pivot_df.loc[row_label, col_label])
                    except Exception:
                        continue
                    if count > min_count_gap:
                        continue
                    record = {"tipe_matriks": tag, "tema": None, "metode": None, "wilayah": None, "jumlah": count}
                    record[row_key] = row_label
                    record[col_key] = col_label
                    gaps.append(record)

        collect_gaps_from_pivot(pivot_tm, "tema_metode")
        collect_gaps_from_pivot(pivot_tr, "tema_wilayah")
        collect_gaps_from_pivot(pivot_mr, "metode_wilayah")

        if len(gaps) > 0:
            gaps_df = pd.DataFrame(gaps, columns=["tipe_matriks","tema","metode","wilayah","jumlah"]).sort_values(["tipe_matriks","jumlah"], ascending=[True, True])
        else:
            gaps_df = pd.DataFrame(columns=["tipe_matriks","tema","metode","wilayah","jumlah"])

        path_gaps_csv = os.path.join(out_folder, "gap_candidates.csv")
        gaps_df.to_csv(path_gaps_csv, index=False)

        if not gaps_df.empty:
            vmax = max(1, int(gaps_df["jumlah"].max()))
            gaps_df["skor_kekosongan"] = 1.0 - (gaps_df["jumlah"].astype(float) / float(vmax))
            gaps_df["skor"] = w_urg*1.0 + w_emp*gaps_df["skor_kekosongan"] + w_fea*1.0
            gaps_df = gaps_df.sort_values("skor", ascending=False)

        # Gap Excel (fallback)
        gap_xlsx = os.path.join(out_folder, "gap_report.xlsx")
        writer = self._get_excel_writer(gap_xlsx)
        if writer is None:
            (pivot_tm if not pivot_tm.empty else pd.DataFrame()).to_csv(os.path.join(out_folder, "pivot_theme_method.csv"))
            (pivot_tr if not pivot_tr.empty else pd.DataFrame()).to_csv(os.path.join(out_folder, "pivot_theme_region.csv"))
            (pivot_mr if not pivot_mr.empty else pd.DataFrame()).to_csv(os.path.join(out_folder, "pivot_method_region.csv"))
            if not gaps_df.empty:
                gaps_df.to_csv(os.path.join(out_folder, "gap_priorities.csv"), index=False)
            with open(os.path.join(out_folder, "README_NO_EXCEL.txt"), "a", encoding="utf-8") as f:
                f.write("gap_report.xlsx was not created (Excel engine unavailable). Use the provided CSVs.\n")
        else:
            with writer:
                (pivot_tm if not pivot_tm.empty else pd.DataFrame()).to_excel(writer, sheet_name="Theme x Method")
                (pivot_tr if not pivot_tr.empty else pd.DataFrame()).to_excel(writer, sheet_name="Theme x Region")
                (pivot_mr if not pivot_mr.empty else pd.DataFrame()).to_excel(writer, sheet_name="Method x Region")
                if not gaps_df.empty:
                    gaps_df.to_excel(writer, sheet_name="Priority Gaps", index=False)

        # Gap Markdown
        gap_md = os.path.join(out_folder, "gap_report.md")
        with io.open(gap_md, "w", encoding="utf-8") as f:
            f.write("# Gap Report\n\n")
            f.write("Matrices for Theme x Method, Theme x Region, and Method x Region have been created.\n\n")
            f.write(f"Gap threshold: article count <= {min_count_gap}.\n\n")
            if not gaps_df.empty:
                f.write("## Top 20 Priority Gaps\n\n")
                for i, row in enumerate(gaps_df.head(20).itertuples(), 1):
                    if row.tipe_matriks == "tema_metode":
                        desc = f"Theme {row.tema} with Method {row.metode}"
                    elif row.tipe_matriks == "tema_wilayah":
                        desc = f"Theme {row.tema} in Region {row.wilayah}"
                    else:
                        desc = f"Method {row.metode} in Region {row.wilayah}"
                    f.write(f"{i}. {desc}. Current count {row.jumlah}. Score {getattr(row,'skor', float('nan')):.3f}\n")
            else:
                f.write("No gaps detected at the current threshold or pivots are empty.\n")

        # --- Info log to user ---
        feedback.pushInfo("=== Bibliometrics + SLR + Gap Finder finished ===")
        feedback.pushInfo(f"Output folder: {out_folder}")
        if region_filter:
            feedback.pushInfo(f"- Region filter active: {region_filter}")
        feedback.pushInfo("Main files:")
        feedback.pushInfo(f"- Bibliometric summary (Excel/CSV): {path_excel}")
        feedback.pushInfo(f"- SLR summary (XLSX/CSV): {slr_xlsx}")
        feedback.pushInfo(f"- Gap report (Excel/CSV+MD): {gap_xlsx}, {gap_md}")
        feedback.pushInfo(f"- Publications per year: {path_pub_year}")
        feedback.pushInfo(f"- Top journals: {path_top_journals}")
        feedback.pushInfo(f"- Top authors: {path_top_authors}")
        feedback.pushInfo(f"- Top keywords: {path_top_keywords}")
        feedback.pushInfo(f"- Keyword network: {path_kw_nodes}, {path_kw_edges}")
        feedback.pushInfo(f"- Author network: {path_auth_nodes}, {path_auth_edges}")
        feedback.pushInfo(f"- Keyword co-occ matrix: {path_kw_mat}")
        feedback.pushInfo(f"- Keyword co-occ heatmap: {path_kw_mat_png if path_kw_mat_png else 'NA'}")
        feedback.pushInfo(f"- Keyword GAP pairs: {path_kw_gap}")
        feedback.pushInfo(f"- Keyword GAP matrix: {path_kw_gap_mat}")
        feedback.pushInfo(
            f"[Plot] Co-occurrence heatmap: {path_kw_mat_png if path_kw_mat_png else 'NA'} | "
            f"shape={getattr(df_mat, 'shape', 'NA')}"
        )
        feedback.pushInfo(
            f"[Plot] GAP heatmap: {path_kw_gap_heatmap if path_kw_gap_heatmap else 'NA'} | "
            f"shape={df_gap_mat.shape if isinstance(df_gap_mat, pd.DataFrame) else 'NA'}"
        )
        feedback.pushInfo("Pivots & Heatmaps:")
        feedback.pushInfo(f"  {path_pivot_tm} | {pivot_tm_png}")
        feedback.pushInfo(f"  {path_pivot_tr} | {pivot_tr_png}")
        feedback.pushInfo(f"  {path_pivot_mr} | {pivot_mr_png}")
        feedback.pushInfo(f"- SLR cluster quality: {cluster_quality_path}")
        feedback.pushInfo(f"- SLR clusters preview: {slr_preview_path}")
        feedback.pushInfo(f"- Gap candidates: {path_gaps_csv}")
        feedback.pushInfo(f"- Keyword network PNG: {keyword_net_png if keyword_net_png else 'NA'}")
        feedback.pushInfo(f"- Keyword centrality CSV: {keyword_centrality_csv if keyword_centrality_csv else 'NA'}")
        feedback.pushInfo(f"- Author network PNG: {author_net_png if author_net_png else 'NA'}")
        feedback.pushInfo(f"- Author centrality CSV: {author_centrality_csv if author_centrality_csv else 'NA'}")
        feedback.pushInfo(f"- Chart top 10 countries (affiliation): {chart_country_top10 if (country_summary is not None) else 'NA'}")

        feedback.pushInfo("=============================================")
        # Optional: create a country point layer if the user specified an output sink
        try:
            if country_summary is not None:
                fields = QgsFields()
                fields.append(QgsField("Country", QVariant.String))
                fields.append(QgsField("Count", QVariant.Int))

                crs = QgsCoordinateReferenceSystem("EPSG:4326")

                sink, dest_id = self.parameterAsSink(
                    parameters,
                    self.PARAM_COUNTRY_POINTS,
                    context,
                    fields,
                    QgsWkbTypes.Point,
                    crs
                )

                if sink is not None:
                    coords_map = country_coords or self._country_coord_dict()
                    for _, row in country_summary.iterrows():
                        cname = str(row["Country"])
                        cnt = int(row["Count"])
                        if cname in coords_map:
                            lon, lat = coords_map[cname]
                            fet = QgsFeature()
                            fet.setFields(fields)
                            fet.setAttribute("Country", cname)
                            fet.setAttribute("Count", int(cnt))
                            pt = QgsPointXY(float(lon), float(lat))
                            fet.setGeometry(QgsGeometry.fromPointXY(pt))
                            sink.addFeature(fet)
        except Exception:
            # If this fails, do not stop the main algorithm
            pass

        # Return outputs (OUTPUT_FOLDER shows in Result Viewer)
        return {
            "OUTPUT_FOLDER": out_folder,
            "total_records": total_records,
            "pub_per_year_csv": path_pub_year,
            "top_journals_csv": path_top_journals,
            "top_authors_csv": path_top_authors,
            "top_keywords_csv": path_top_keywords,
            "kw_nodes_csv": path_kw_nodes,
            "kw_edges_csv": path_kw_edges,
            "author_nodes_csv": path_auth_nodes,
            "author_edges_csv": path_auth_edges,
            "excel_summary": path_excel,
            "slr_summary_xlsx": slr_xlsx,
            "chart_publications_per_year": chart_pub_year,
            "chart_top_journals": chart_top_journals,
            "chart_top_authors": chart_top_authors,
            "chart_top_keywords": chart_top_keywords,
            "chart_keyword_trends": chart_kw_trends,
            "keyword_trends_csv": trend_csv,
            "slr_cluster_summary_csv": cluster_summary_path,
            "slr_candidates_csv": slr_candidates_path,
            "slr_prisma_json": prisma_path,
            "slr_cluster_quality_json": cluster_quality_path,
            "slr_clusters_preview_csv": slr_preview_path,
            "pivot_theme_method_csv": path_pivot_tm,
            "pivot_theme_region_csv": path_pivot_tr,
            "pivot_method_region_csv": path_pivot_mr,
            "gap_candidates_csv": path_gaps_csv,
            "gap_report_xlsx": gap_xlsx,
            "gap_report_md": gap_md,
            "keyword_gap_pairs_csv": path_kw_gap,
            "keyword_cooccurrence_matrix_csv": path_kw_mat,
            "keyword_cooccurrence_heatmap_png": path_kw_mat_png if path_kw_mat_png else "",
            "keyword_gap_matrix_csv": path_kw_gap_mat,
            "keyword_gap_heatmap_png": path_kw_gap_heatmap if path_kw_gap_heatmap else "",
            "pivot_theme_method_heatmap_png": pivot_tm_png if os.path.exists(pivot_tm_png) else "",
            "pivot_theme_region_heatmap_png": pivot_tr_png if os.path.exists(pivot_tr_png) else "",
            "pivot_method_region_heatmap_png": pivot_mr_png if os.path.exists(pivot_mr_png) else "",
            "region_filter_applied": ", ".join(region_filter) if region_filter else "",
            "keyword_network_png": keyword_net_png if keyword_net_png else "",
            "author_network_png": author_net_png if author_net_png else "",
            "keyword_network_centrality_csv": keyword_centrality_csv if keyword_centrality_csv else "",
            "author_network_centrality_csv": author_centrality_csv if author_centrality_csv else "",
            "chart_top10_countries_png": chart_country_top10 if (country_summary is not None) else "",
        }
