import re
import PyPDF2
import docx
import spacy

# Load the `en_core_web_sm` spaCy pipeline once, at import time.
# extract_name() runs it over the top of the resume to look for PERSON entities.
nlp = spacy.load("en_core_web_sm")

# Skill keywords that extract_skills() looks for in the lower-cased resume
# text. Entries are written in lower case for that reason; multi-word entries
# ("machine learning", "rest api") are looked up as a single phrase.
SKILL_KEYWORDS = [
    "python", "javascript", "typescript", "java", "c++", "c#", "go", "rust",
    "react", "vue", "angular", "next.js", "node.js", "express", "django", "flask",
    "fastapi", "spring", "html", "css", "tailwind", "bootstrap",
    "mongodb", "postgresql", "mysql", "redis", "sqlite",
    "docker", "kubernetes", "aws", "gcp", "azure", "git", "linux",
    "machine learning", "deep learning", "tensorflow", "pytorch", "pandas", "numpy",
    "sql", "graphql", "rest api", "figma", "photoshop", "blender",
]

def extract_text_from_pdf(file_path: str) -> str:
    """Return the text of every page of the PDF at ``file_path``.

    Page texts are concatenated in page order with no separator. A page whose
    ``extract_text()`` result is falsy (``None`` or ``""``) contributes
    nothing.
    """
    with open(file_path, "rb") as pdf_file:
        pages = PyPDF2.PdfReader(pdf_file).pages
        # Extraction must happen while the file is still open.
        return "".join(page.extract_text() or "" for page in pages)

def extract_text_from_docx(file_path: str) -> str:
    """Return the paragraph texts of the .docx at ``file_path``, one per line.

    Only ``Document.paragraphs`` is read; paragraphs are joined with ``"\\n"``.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def extract_email(text: str) -> str:
    """Return the first email-like token in ``text``, or ``""`` if none.

    "Email-like" means: word characters, dots or hyphens, an ``@``, then a
    domain containing at least one dot.
    """
    found = re.search(r"[\w\.-]+@[\w\.-]+\.\w+", text)
    if found is None:
        return ""
    return found.group(0)

def extract_phone(text: str) -> str:
    """Return the first phone-like run of characters in ``text``, or ``""``.

    A match is an optional leading ``+`` or ``(``, a digit, at least seven
    further digits / whitespace / hyphens / parentheses, and a final digit.
    The pattern is deliberately loose, so other digit runs such as
    ``"2019 - 2021"`` also qualify.
    """
    found = re.search(r"[\+\(]?[0-9][0-9\s\-\(\)]{7,}[0-9]", text)
    if not found:
        return ""
    return found.group(0).strip()

def extract_name(text: str) -> str:
    """Return a best-guess candidate name from resume ``text``.

    The first 500 characters are run through the module-level spaCy pipeline
    and the first entity labelled ``PERSON`` is returned. If there is none,
    the first non-blank line of the whole text (stripped) is used instead.
    Returns ``""`` when the text has no non-blank line.
    """
    # The name is usually in the first few lines, so only the head is analysed.
    header = nlp(text[:500])
    person = next(
        (entity.text for entity in header.ents if entity.label_ == "PERSON"),
        None,
    )
    if person is not None:
        return person

    # Fallback: first non-empty line.
    for raw_line in text.split("\n"):
        candidate = raw_line.strip()
        if candidate:
            return candidate
    return ""

def extract_skills(text: str) -> list:
    """Return display labels for every ``SKILL_KEYWORDS`` entry found in ``text``.

    Matching is case-insensitive and on whole tokens: a keyword only counts
    when it is not glued to another word character on either side. A plain
    substring test would report "go" for "algorithms", "java" for
    "javascript", "sql" for "mysql" and so on. A single trailing "s" is
    tolerated so plural forms such as "REST APIs" still match.

    Labels are formatted as before: keywords of up to four characters use
    ``str.title()``, longer ones ``str.capitalize()``. The result is
    de-duplicated and ordered as in ``SKILL_KEYWORDS``, so the same input
    always gives the same list.
    """
    text_lower = text.lower()
    found = []
    for skill in SKILL_KEYWORDS:
        # Lookarounds instead of \b: keywords such as "c++" and "c#" end in a
        # non-word character, where \b would never match.
        pattern = rf"(?<!\w){re.escape(skill)}s?(?!\w)"
        if re.search(pattern, text_lower):
            found.append(skill.title() if len(skill) <= 4 else skill.capitalize())
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(found))

def extract_section(text: str, section_name: str, next_sections: list) -> str:
    """Return the body of the resume section headed by ``section_name``.

    A *heading* is recognised per line, case-insensitively: optional leading
    punctuation, at most three leading words ("Work Experience",
    "Licenses & Certifications"), then the section name, optionally extended
    by more word characters ("certif" matches "Certifications").

    * The opening heading must be the end of its line, apart from trailing
      punctuation such as ":".
    * The body runs until the first later line that is a heading for any of
      ``next_sections``. Such a line may end after the name or continue after
      a colon ("Skills: Python, Go"). If there is none, the body runs to the
      end of ``text``.

    Section words that appear inside ordinary sentences ("Gained experience
    with ...") are therefore neither headings nor terminators.

    Args:
        text: Full resume text.
        section_name: Section name, interpolated into the pattern as a
            regular-expression fragment.
        next_sections: Names (regex fragments) of sections that end this one.
            May be empty.

    Returns:
        The stripped body text, or ``""`` when no opening heading is found.
    """
    flags = re.IGNORECASE | re.MULTILINE
    # Start of a heading line: decoration, then up to three leading words.
    heading_prefix = r"^[^\w\n]*(?:\w+[^\w\n]+){0,3}"

    start = re.search(
        heading_prefix + rf"(?:{section_name})\w*[^\w\n]*\n", text, flags
    )
    if start is None:
        return ""

    body_start = start.end()
    body_end = len(text)
    # With no next sections the body simply runs to the end of the text.
    if next_sections:
        stop = re.compile(
            heading_prefix
            + rf"(?:{'|'.join(next_sections)})\w*(?:[ \t]*:|[^\w\n]*$)",
            flags,
        )
        following = stop.search(text, body_start)
        if following is not None:
            body_end = following.start()
    return text[body_start:body_end].strip()

def extract_projects(text: str) -> list:
    """Return up to six project entries parsed from the "projects" section.

    The section body is split on blank lines. Within each chunk the first
    non-blank line is the project name, the next two lines (at most) form the
    description, and ``extract_skills`` is run over the whole chunk to fill
    ``techStack``. Returns ``[]`` when the section is missing or empty.
    """
    body = extract_section(text, "projects", ["experience", "education", "certifications", "skills"])
    if not body:
        return []

    entries = []
    for chunk in re.split(r"\n{2,}", body):
        rows = [row for row in map(str.strip, chunk.split("\n")) if row]
        if not rows:
            continue
        title, *details = rows
        entries.append({
            "name": title,
            "description": " ".join(details[:2]),
            "techStack": extract_skills(" ".join(rows)),
        })
    return entries[:6]

def extract_experience(text: str) -> list:
    """Return up to five job entries parsed from the "experience" section.

    The section body is split on blank lines. A chunk needs at least two
    non-blank lines to count: line one is the company, line two the role,
    line three (if present) the duration, and the following three lines at
    most are kept as bullets. Returns ``[]`` when the section is missing or
    empty.
    """
    body = extract_section(text, "experience", ["education", "projects", "certifications", "skills"])
    if not body:
        return []

    jobs = []
    for chunk in re.split(r"\n{2,}", body):
        rows = [row for row in map(str.strip, chunk.split("\n")) if row]
        # A single-line chunk has no role line, so it is skipped.
        if len(rows) < 2:
            continue
        jobs.append({
            "company": rows[0],
            "role": rows[1],
            "duration": rows[2] if len(rows) > 2 else "",
            "bullets": rows[3:6],
        })
    return jobs[:5]

def extract_education(text: str) -> list:
    """Return up to three education entries parsed from the "education" section.

    The section body is split on blank lines. In each chunk the first
    non-blank line is the institution and the second (if any) the degree.
    ``year`` is the first 19xx/20xx number found anywhere in the chunk, as an
    ``int``, or ``None`` if there is none. ``field`` is always ``""``; it is
    not parsed.

    "certifications" is included in the list of following sections, like the
    other section extractors, so a Certifications section placed after
    Education is not absorbed into the education entries.

    Returns ``[]`` when the section is missing or empty.
    """
    section = extract_section(
        text, "education", ["experience", "projects", "certifications", "skills"]
    )
    if not section:
        return []
    education = []
    for block in re.split(r"\n{2,}", section):
        lines = [l.strip() for l in block.split("\n") if l.strip()]
        if lines:
            year_match = re.search(r"\b(19|20)\d{2}\b", " ".join(lines))
            education.append({
                "institution": lines[0],
                "degree": lines[1] if len(lines) > 1 else "",
                "field": "",
                "year": int(year_match.group(0)) if year_match else None,
            })
    return education[:3]

def extract_certifications(text: str) -> list:
    """Return up to six non-blank lines from the certifications section.

    ``extract_section`` interpolates the section name into a regular
    expression, so the name is given as ``certif\\w*``. That covers the whole
    heading word ("Certifications", "Certificates", ...). The bare prefix
    "certif" had to be followed directly by the end of the line, which a real
    heading never is, so the section was never found.

    Returns ``[]`` when the section is missing or empty.
    """
    section = extract_section(text, r"certif\w*", ["experience", "projects", "skills", "education"])
    if not section:
        return []
    return [l.strip() for l in section.split("\n") if l.strip()][:6]

def parse_resume(file_path: str, file_type: str) -> dict:
    """Read a resume file and return its extracted fields.

    Args:
        file_path: Path of the resume file.
        file_type: ``"pdf"`` or ``"docx"`` (exact match).

    Returns:
        A dict with keys ``name``, ``email``, ``phone``, ``skills``,
        ``projects``, ``experience``, ``education``, ``certifications`` and
        ``rawText`` (the full extracted text).

    Raises:
        ValueError: If ``file_type`` is neither ``"pdf"`` nor ``"docx"``.
    """
    if file_type not in ("pdf", "docx"):
        raise ValueError("Unsupported file type")

    read_text = extract_text_from_pdf if file_type == "pdf" else extract_text_from_docx
    text = read_text(file_path)

    parsed = {}
    parsed["name"] = extract_name(text)
    parsed["email"] = extract_email(text)
    parsed["phone"] = extract_phone(text)
    parsed["skills"] = extract_skills(text)
    parsed["projects"] = extract_projects(text)
    parsed["experience"] = extract_experience(text)
    parsed["education"] = extract_education(text)
    parsed["certifications"] = extract_certifications(text)
    parsed["rawText"] = text
    return parsed