import logging
import time
from concurrent.futures import ThreadPoolExecutor

from shapely.geometry import shape
import pandas as pd
from osgeo import gdal, ogr
import requests


# Attribute mapping for the marine hydrobiological sampling table.
# Keys are column names of the intermediate table; values are the names of the
# output columns they are copied to in procesar_hidrobio.
mapeo = {
    "ID_MUESTRA": "materialSampleID",
    "LABORAT": "recordedBy",
    "FEC_ANALIS": "dateIdentified",
    # "INS_MUEST" is not copied directly: samplingProtocol is derived from it
    # in procesar_campos_especificos.
    "T_ESF_MUES": "samplingEffort",
    "TAM_MUEST": "sampleSizeValue",
    "PROFUND_m": "verbatimDepth",
    # "HIDROBIOTA" is not copied directly: it is combined with N_COMUN into
    # vernacularName in procesar_campos_especificos.
    # NOTE: scientificName and vernacularName are overwritten afterwards by the
    # computed fields that procesar_hidrobio copies from the intermediate table.
    "ESPECIE": "scientificName", 
    "CLASE": "class",
    "ORDEN": "order",
    "FAMILIA": "family",
    "GENERO": "genus",
    "N_COMUN": "vernacularName",
    "DENS_CAMTI": "organismQuantity",
    "UNID_DENS": "organismQuantityType",
    # The keys below are the columns added by agregar_datos_api_a_excel.
    "taxonRank": "taxonRank",
    "canonicalName": "acceptedNameUsage",
    "scientificNameAuthorship": "scientificNameAuthorship",
    "taxonomicStatus": "taxonomicStatus",
    "scientificNameID": "scientificNameID",
    "nameAccordingTo": "nameAccordingTo",
    "nameAccordingToID": "nameAccordingToID",
    "PROYECTO": "datasetName"

}

# Constant values for the marine hydrobiological sampling output.
# procesar_hidrobio assigns each key as a column holding the same value on
# every row of the final table.
valores_constantes = {
    "type": "Event",
    "basisOfRecord": "HumanObservation",
    "occurrenceStatus": "present",
    "language": "es",
    "continent": "América del Sur",
    "country": "Colombia",
    "countryCode": "CO",
  
}

# Complete, ordered list of the columns of the final file.
# procesar_hidrobio creates the output DataFrame with exactly these columns, so
# any column it assigns that is NOT listed here is appended at the far right of
# the sheet. "measurementUnit" is listed next to its measurementValue /
# measurementType siblings for that reason.
lista_atributos = [
    "id", "type", "language", "institutionID", "institutionCode", "datasetName",
    "basisOfRecord", "occurrenceID", "recordedBy", "recordNumber", "individualCount",
    "occurrenceStatus", "occurrenceRemarks", "organismRemarks", "organismQuantity", "organismQuantityType", "eventID",
    "parentEventID", "fieldNumber", "materialSampleID", "eventDate", "year", "month", "day", "eventTime",
    "verbatimEventDate", "dateIdentified", "habitat", "samplingProtocol", "samplingEffort", "sampleSizeValue", "sampleSizeUnit",
    "eventRemarks", "continent", "waterBody", "country",
    "countryCode", "verbatimDepth", "verbatimIdentification", "identificationQualifier", "scientificName",
    "acceptedNameUsage", "higherClassification", "kingdom", "phylum", "class",
    "order", "family", "genus", "specificEpithet", "infraspecificEpithet",
    "taxonRank", "scientificNameAuthorship", "vernacularName",
    "taxonomicStatus", "scientificNameID", "nameAccordingTo", "nameAccordingToID",
    "measurementValue", "measurementType", "measurementUnit", "measurementTypeID", "measurementUnitID",
    "measurementValue_1", "measurementType_1", "measurementUnit_1",
    "measurementValue_2", "measurementType_2", "measurementUnit_2",
    "measurementValue_3", "measurementType_3", "measurementUnit_3",
    "measurementValue_4", "measurementType_4", "measurementUnit_4",
]



def consultar_taxon_rank(especie):
    """Look up a scientific name in the WoRMS REST API.

    Args:
        especie: Scientific name to match. ``None``, NaN and blank strings are
            rejected without contacting the service.

    Returns:
        A dict with the keys ``taxonRank``, ``canonicalName``,
        ``scientificNameAuthorship``, ``taxonomicStatus``, ``scientificNameID``,
        ``nameAccordingTo`` and ``nameAccordingToID`` built from the first match
        of the response (taxon fields are ``None`` when the response holds no
        match), or ``None`` when the name is empty, the HTTP status is not 200
        or the request / JSON decoding fails.
    """
    # Nothing to look up: avoid sending "nan" or an empty name to the service.
    if pd.isna(especie) or not str(especie).strip():
        return None

    url = "http://www.marinespecies.org/rest/AphiaRecordsByMatchNames"
    # Let requests build the query string so that spaces, '&', '#', accents,
    # etc. in the name are percent-encoded instead of corrupting the URL.
    parametros = {
        "scientificnames[]": str(especie).strip(),
        "marine_only": "false",
    }

    try:
        time.sleep(1)  # throttle: one-second pause before every request
        # A timeout keeps a stalled connection from blocking a worker forever.
        response = requests.get(url, params=parametros, timeout=30)
        if response.status_code != 200:
            return None
        data = response.json()
    except (requests.RequestException, ValueError):
        # Best effort: a failed lookup must not abort the whole batch.
        logging.getLogger(__name__).warning(
            "Fallo la consulta a WoRMS para %r", especie, exc_info=True
        )
        return None

    # Only the first record of the first result list is used; any other
    # response shape is treated as "no match".
    coincidencias = data[0] if isinstance(data, list) and data else None
    especie_data = (
        coincidencias[0] if isinstance(coincidencias, list) and coincidencias else {}
    )
    if not isinstance(especie_data, dict):
        especie_data = {}

    aphia_id = especie_data.get("AphiaID")
    return {
        "taxonRank": especie_data.get("rank"),
        "canonicalName": especie_data.get("valid_name"),
        "scientificNameAuthorship": especie_data.get("authority"),
        "taxonomicStatus": especie_data.get("status"),
        "scientificNameID": (
            f"https://www.marinespecies.org/aphia.php?p=taxdetails&id={aphia_id}"
            if aphia_id
            else None
        ),
        "nameAccordingTo": "World Register of Marine Species",
        "nameAccordingToID": "https://www.marinespecies.org",
    }


def agregar_datos_api_a_excel(ruta_excel_hidrobio):
    """Add the taxonomy columns returned by consultar_taxon_rank to an Excel file.

    Reads the workbook at *ruta_excel_hidrobio*, looks up every distinct value
    of its ``ESPECIE`` column once, writes the looked-up values into the columns
    ``taxonRank``, ``canonicalName``, ``scientificNameAuthorship``,
    ``taxonomicStatus``, ``scientificNameID``, ``nameAccordingTo`` and
    ``nameAccordingToID``, and overwrites the same file.

    Args:
        ruta_excel_hidrobio: Path of the Excel file to read and overwrite.

    Returns:
        None. This is a best-effort step: any failure (unreadable file, missing
        ``ESPECIE`` column, write error) is logged and the file is left as it
        was.
    """
    columnas_api = (
        "taxonRank",
        "canonicalName",
        "scientificNameAuthorship",
        "taxonomicStatus",
        "scientificNameID",
        "nameAccordingTo",
        "nameAccordingToID",
    )

    try:
        df = pd.read_excel(ruta_excel_hidrobio)

        if "ESPECIE" not in df.columns:
            raise ValueError("❌ ERROR: La columna 'ESPECIE' no está en el archivo Excel.")

        especies = df["ESPECIE"].tolist()

        # Query each distinct, non-missing name once: the same species usually
        # appears on many rows and every lookup costs a throttled HTTP call.
        unicas = list(dict.fromkeys(e for e in especies if pd.notna(e)))

        respuestas = {}
        if unicas:
            # Run the API lookups in parallel (never ask for zero workers).
            with ThreadPoolExecutor(max_workers=min(50, len(unicas))) as executor:
                respuestas = dict(
                    zip(unicas, executor.map(consultar_taxon_rank, unicas))
                )

        # One result dict per row; failed or skipped lookups become empty dicts
        # so every new column still gets a value (None) on that row.
        filas = []
        for especie in especies:
            resultado = respuestas.get(especie) if pd.notna(especie) else None
            filas.append(resultado if isinstance(resultado, dict) else {})

        # Add the new columns to the original DataFrame.
        for columna in columnas_api:
            df[columna] = [fila.get(columna) for fila in filas]

        df.to_excel(ruta_excel_hidrobio, index=False)

    except Exception:
        logging.getLogger(__name__).exception(
            "No se pudieron agregar los datos de la API a %s", ruta_excel_hidrobio
        )
        return None



# Export a DataFrame to Excel
def exportar_excel(dataframe, ruta_salida):
    """Write *dataframe* to the Excel file *ruta_salida* without the index.

    Args:
        dataframe: Object exposing ``to_excel(path, index=...)``.
        ruta_salida: Destination path of the Excel file.

    Returns:
        None. A failed write is logged (with traceback) instead of being
        raised, so callers keep their best-effort behaviour.
    """
    try:
        dataframe.to_excel(ruta_salida, index=False)
    except Exception:
        logging.getLogger(__name__).exception(
            "No se pudo exportar el archivo Excel %s", ruta_salida
        )
        return None



# Text values (compared stripped and lower-cased) treated as "no data".
_TEXTOS_NULOS = ("", "none", "nan", "<null>")

# Taxonomic columns joined, in this order, into higherClassification.
_CAMPOS_CLASIFICACION = [
    "PHYLLUM", "SUBPHYLUM", "DIVISION", "CLASE", "SUBCLASE", "ORDEN", "FAMILIA", "GENERO"
]

# Temporary mapping of the HIDROBIOTA domain codes to their labels.
_DOMINIOS_HIDROBIOTA = {
    "120201": "Macroinvertebrados (Bentos)",
    "120202": "Perifiton",
    "120203": "Zooplancton",
    "120204": "Fitoplancton",
    "120205": "Macrófitas",
    "120206": "Meiofauna",
    "120207": "Ictioplancton",
    "120208": "Macrofauna"
}

# INS_MUEST domain codes and the samplingProtocol label written for each one.
_PROTOCOLOS_MUESTREO = {
    501: "ADCP",
    502: "Botella Go Flo",
    503: "Botella Nansen",
    504: "Botella Niskin",
    505: "Botella Rountner",
    506: "Box corer",
    507: "Censo visual",
    508: "CTD",
    509: "CTDO",
    510: "Cuadrante",
    511: "Draga",
    512: "Draga Van Veen",
    513: "HADCP",
    514: "LADCP",
    515: "Nasas",
    516: "Nucleador (Corazonador)",
    517: "Observación",
    518: "Otro",
    519: "Palangre horizontal",
    520: "Palangre vertical",
    521: "Piston corer",
    522: "Recolección directa",
    523: "Red cónica",
    524: "Red de arrastre",
    525: "Red de encierro",
    526: "Red de enmalle",
    527: "Transecto con punto intercepto",
    528: "Transectos con cadena intercepto",
    529: "Transectos en línea"
}


def _es_vacio(valor):
    """Return True when *valor* is missing or one of the textual null markers."""
    return bool(pd.isna(valor)) or str(valor).strip().lower() in _TEXTOS_NULOS


def _unir_no_vacios(*valores):
    """Join the non-empty values with ' | '; return None when all are empty."""
    partes = [str(valor) for valor in valores if not _es_vacio(valor)]
    return " | ".join(partes) if partes else None


def _columna_o_vacia(df, nombre):
    """Return column *nombre* of *df*, or one None per row when it is absent."""
    return df[nombre] if nombre in df.columns else [None] * len(df)


def _agregar_medicion(df, origen, sufijo, tipo, col_unidad=None, constantes=None):
    """Fill the measurement columns with suffix *sufijo* from column *origen*.

    On rows where *origen* has a value, ``measurementValue<sufijo>`` gets that
    value, ``measurementType<sufijo>`` gets *tipo*, every column named in
    *constantes* gets its constant, and (when *col_unidad* is given)
    ``measurementUnit<sufijo>`` gets the value of *col_unidad*. All of these
    columns are None elsewhere, and entirely None when *origen* is absent or
    has no values at all.
    """
    constantes = constantes or {}
    col_valor = f"measurementValue{sufijo}"
    col_tipo = f"measurementType{sufijo}"
    col_unidad_salida = f"measurementUnit{sufijo}" if col_unidad is not None else None

    if origen in df.columns and df[origen].notna().any():
        presentes = df[origen].notna().tolist()
        df[col_valor] = df[origen]
        df[col_tipo] = [tipo if presente else None for presente in presentes]
        for columna, valor in constantes.items():
            df[columna] = [valor if presente else None for presente in presentes]
        if col_unidad_salida is not None:
            # The unit column may be missing from the table; then no unit is set.
            unidades = _columna_o_vacia(df, col_unidad)
            df[col_unidad_salida] = [
                unidad if presente else None
                for unidad, presente in zip(unidades, presentes)
            ]
    else:
        df[col_valor] = None
        df[col_tipo] = None
        for columna in constantes:
            df[columna] = None
        if col_unidad_salida is not None:
            df[col_unidad_salida] = None


# Compute the derived output fields
def procesar_campos_especificos(df):
    """Compute the derived fields, in sequence, on the intermediate table.

    The DataFrame is modified in place and also returned. Every derived column
    listed below is always created, even when its source columns are absent,
    so callers can read them unconditionally.

    Derived columns: ``id``, ``eventID``, ``verbatimEventDate``, ``eventDate``,
    ``year``, ``month``, ``day``, ``eventTime``, ``phylum``,
    ``temp_HIDROBIOTA``, ``vernacularName``, ``higherClassification``,
    ``scientificName``, ``samplingProtocol``, ``measurementValue`` /
    ``measurementType`` / ``measurementUnit`` / ``measurementTypeID`` /
    ``measurementUnitID`` and ``measurementValue_N`` / ``measurementType_N`` /
    ``measurementUnit_N`` for N in 1..4. Missing taxonomic source columns are
    added as empty columns, and ``HIDROBIOTA`` (when present) is normalised to
    a code string.

    Args:
        df: Intermediate table as a pandas DataFrame, or None.

    Returns:
        The same DataFrame with the derived columns added; *df* unchanged when
        it is None or empty.
    """
    if df is None or df.empty:
        return df

    # Make sure every taxonomic source column exists before it is read.
    for col in _CAMPOS_CLASIFICACION:
        if col not in df.columns:
            df[col] = None

    # ID_PUNTO_M -> id and eventID
    identificadores = df["ID_PUNTO_M"] if "ID_PUNTO_M" in df.columns else None
    df["id"] = identificadores
    df["eventID"] = identificadores

    # FEC_TOM -> verbatimEventDate (kept verbatim) and the parsed date parts.
    df["verbatimEventDate"] = df["FEC_TOM"] if "FEC_TOM" in df.columns else None
    fechas = pd.to_datetime(df["verbatimEventDate"], errors="coerce")  # parse once
    df["eventDate"] = fechas.dt.strftime("%Y-%m-%d")
    df["year"] = fechas.dt.year
    df["month"] = fechas.dt.month
    df["day"] = fechas.dt.day

    # HORA (read as a number of hours) -> eventTime formatted HH:MM:SS.
    # Rows without a time stay empty instead of failing on int(NaN).
    if "HORA" in df.columns:
        componentes = pd.to_timedelta(df["HORA"], unit="h").dt.components
        df["eventTime"] = [
            None if pd.isna(horas)
            else f"{int(horas):02}:{int(minutos):02}:{int(segundos):02}"
            for horas, minutos, segundos in zip(
                componentes["hours"], componentes["minutes"], componentes["seconds"]
            )
        ]
    else:
        df["eventTime"] = None

    # PHYLLUM and DIVISION -> phylum
    df["phylum"] = [
        _unir_no_vacios(phyllum, division)
        for phyllum, division in zip(df["PHYLLUM"], df["DIVISION"])
    ]

    # HIDROBIOTA domain code -> label (temporary mapping). Codes are normalised
    # to integer strings; missing, non-numeric and zero codes become "".
    if "HIDROBIOTA" in df.columns:
        codigos = pd.to_numeric(df["HIDROBIOTA"], errors="coerce")
        df["HIDROBIOTA"] = [
            "" if pd.isna(codigo) or int(codigo) == 0 else str(int(codigo))
            for codigo in codigos
        ]
        df["temp_HIDROBIOTA"] = df["HIDROBIOTA"].map(_DOMINIOS_HIDROBIOTA).fillna("")
    else:
        df["temp_HIDROBIOTA"] = ""

    # HIDROBIOTA label and N_COMUN -> vernacularName (empty parts are skipped,
    # so a missing label no longer yields a leading " | ").
    df["vernacularName"] = [
        _unir_no_vacios(dominio, nombre_comun)
        for dominio, nombre_comun in zip(
            df["temp_HIDROBIOTA"], _columna_o_vacia(df, "N_COMUN")
        )
    ]

    # higherClassification: non-empty taxonomic values joined from the highest
    # rank to the lowest ("" when the row has none).
    df["higherClassification"] = [
        " | ".join(str(valor) for valor in fila if not _es_vacio(valor))
        for fila in df[_CAMPOS_CLASIFICACION].itertuples(index=False, name=None)
    ]

    # scientificName: ESPECIE when it holds a real value, otherwise the last
    # element of higherClassification.
    df["scientificName"] = [
        especie if not _es_vacio(especie) else clasificacion.split(" | ")[-1]
        for especie, clasificacion in zip(
            _columna_o_vacia(df, "ESPECIE"), df["higherClassification"]
        )
    ]

    # INS_MUEST -> samplingProtocol ("" for unknown or missing codes).
    if "INS_MUEST" in df.columns:
        codigos_protocolo = pd.to_numeric(df["INS_MUEST"], errors="coerce")
        df["samplingProtocol"] = [
            _PROTOCOLOS_MUESTREO.get(codigo, "") for codigo in codigos_protocolo
        ]
    else:
        df["samplingProtocol"] = None

    # PROF_SECCH -> measurementValue and its fixed type / unit / vocabulary IDs.
    _agregar_medicion(
        df,
        "PROF_SECCH",
        "",
        "Profundidad de disco secchi",
        constantes={
            "measurementUnit": "m",
            "measurementTypeID": "https://vocab.nerc.ac.uk/collection/P02/current/SECC/",
            "measurementUnitID": "https://vocab.nerc.ac.uk/collection/P06/current/ULAA/",
        },
    )

    # BIOM_HUM / BIOM_SEC / MO / BIOM_VOL -> measurementValue_1.._4
    _agregar_medicion(df, "BIOM_HUM", "_1", "Biomasa húmeda por área o volumen", col_unidad="UNIDAD_BH")
    _agregar_medicion(df, "BIOM_SEC", "_2", "Biomasa seca por área o volumen", col_unidad="UNIDAD_BS")
    # NOTE(review): MO and BIOM_VOL reuse the BIOM_SEC label; this looks like a
    # copy-paste leftover. Kept as-is until the intended labels are confirmed.
    _agregar_medicion(df, "MO", "_3", "Biomasa seca por área o volumen", col_unidad="UNIDAD_MO")
    _agregar_medicion(df, "BIOM_VOL", "_4", "Biomasa seca por área o volumen", col_unidad="UNIDAD_BV")

    return df





# Main entry point for the marine hydrobiological sampling table
def procesar_hidrobio(ruta_gdb, tabla_hidrobio,  ruta_excel_hidrobio, archivo_entrada_hidrobio, archivo_salida_hidrobio):
    """Export one geodatabase table to the final Excel file.

    Steps: read every feature of *tabla_hidrobio* from the geodatabase, dump it
    to the intermediate Excel file, enrich that file through
    agregar_datos_api_a_excel, compute the derived fields with
    procesar_campos_especificos, assemble the final table (columns of
    lista_atributos, filled through mapeo, valores_constantes and the derived
    fields), drop the numbered measurement triplets that carry no value, and
    write the result to *archivo_salida_hidrobio*.

    Args:
        ruta_gdb: Path of the geodatabase opened with GDAL.
        tabla_hidrobio: Name of the layer/table to read.
        ruta_excel_hidrobio: Path of the intermediate Excel file (overwritten).
        archivo_entrada_hidrobio: Not used by this function; kept for
            interface compatibility.
        archivo_salida_hidrobio: Path of the final Excel file.

    Returns:
        None. Any failure is logged with its traceback and the function
        returns without raising.
    """
    # Derived columns copied from the intermediate table to the final one.
    campos_calculados = (
        "id", "eventID", "verbatimEventDate", "eventDate", "year", "month", "day",
        "eventTime", "phylum", "vernacularName", "higherClassification",
        "scientificName", "samplingProtocol",
        "measurementValue", "measurementType", "measurementUnit",
        "measurementTypeID", "measurementUnitID",
        "measurementValue_1", "measurementType_1", "measurementUnit_1",
        "measurementValue_2", "measurementType_2", "measurementUnit_2",
        "measurementValue_3", "measurementType_3", "measurementUnit_3",
        "measurementValue_4", "measurementType_4", "measurementUnit_4",
    )

    try:
        # Open the geodatabase
        gdb = gdal.OpenEx(ruta_gdb, gdal.OF_VECTOR)
        if not gdb:
            raise RuntimeError(f"❌ No se pudo abrir la GDB en {ruta_gdb}")

        # Read the attributes of every feature of the table
        capa = gdb.GetLayerByName(tabla_hidrobio)
        if capa is None:
            # Fail with a clear message instead of a TypeError when iterating None.
            raise RuntimeError(
                f"❌ No se encontró la tabla {tabla_hidrobio} en la GDB {ruta_gdb}"
            )
        datos_tabla = [feature.items() for feature in capa]

        resultado = pd.DataFrame(datos_tabla).replace("<Null>", "")

        # Write the raw table to the intermediate Excel file
        exportar_excel(resultado, ruta_excel_hidrobio)

        # Add the taxonomy columns to the intermediate file
        agregar_datos_api_a_excel(ruta_excel_hidrobio)

        # Read the enriched intermediate file back and compute derived fields
        df_intermedio = pd.read_excel(ruta_excel_hidrobio)
        df_intermedio = procesar_campos_especificos(df_intermedio)

        # Final table with every attribute of lista_atributos
        df_final = pd.DataFrame(columns=lista_atributos)

        # Copy the directly mapped columns
        for columna_intermedia, columna_final in mapeo.items():
            if columna_intermedia in df_intermedio.columns:
                df_final[columna_final] = df_intermedio[columna_intermedia]

        # Add the constant values
        for clave, valor in valores_constantes.items():
            df_final[clave] = valor

        # Copy the derived fields; a field that was not produced is left empty
        # rather than aborting the export with a KeyError.
        for campo in campos_calculados:
            if campo in df_intermedio.columns:
                df_final[campo] = df_intermedio[campo]

        # Drop every numbered measurement triplet whose value column is
        # entirely empty (missing or blank text).
        columnas_a_eliminar = []
        for col in df_final.columns:
            if not col.startswith("measurementValue_"):
                continue
            sufijo = col.split("_")[-1]
            serie = df_final[col].astype("string")
            if (serie.isna() | (serie.str.strip() == "")).all():
                columnas_a_eliminar.extend([
                    f"measurementValue_{sufijo}",
                    f"measurementType_{sufijo}",
                    f"measurementUnit_{sufijo}",
                ])

        columnas_a_eliminar = [c for c in columnas_a_eliminar if c in df_final.columns]
        if columnas_a_eliminar:
            df_final.drop(columns=columnas_a_eliminar, inplace=True)

        # Write the final table
        exportar_excel(df_final, archivo_salida_hidrobio)

    except Exception:
        logging.getLogger(__name__).exception(
            "Fallo el procesamiento de la tabla %s", tabla_hidrobio
        )
        return None
