#!pip install bs4
# reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
from io import StringIO

from bs4 import BeautifulSoup
import requests
import pandas as pd

from DefaultPackages import openFile, saveFile
from NER import cleanText

class HTML():
    def __init__(self, htmlFile, htmlLink):
        self.htmlLink = htmlLink
        self.htmlFile = htmlFile

    def openHTMLFile(self):
        """Parse the article from the link if one is given, else from the local file."""
        if self.htmlLink not in (None, "None"):
            r = requests.get(self.htmlLink)
            soup = BeautifulSoup(r.content, 'html.parser')
        else:
            with open(self.htmlFile) as fp:
                soup = BeautifulSoup(fp, 'html.parser')
        return soup

    def getText(self):
        """Return the cleaned visible text of the whole document."""
        soup = self.openHTMLFile()
        cl = cleanText.cleanGenText()
        parts = []
        for tag in soup.find_all("html"):  # normally a single <html> root
            parts.append(cl.removeExtraSpaceBetweenWords(tag.get_text()))
        return " ".join(parts)

    def getSectionDict(self):
        """Map each <h2> heading to the list of <p> texts that follow it,
        stopping at the first paragraph that belongs to the next heading."""
        sections = {}
        soup = self.openHTMLFile()
        h2Tags = soup.find_all('h2')
        for h2Pos, h2 in enumerate(h2Tags):
            if h2.text not in sections:
                sections[h2.text] = []
            if h2Pos + 1 < len(h2Tags):
                content = h2.find_next("p")
                nexth2Content = h2Tags[h2Pos + 1].find_next("p")
                while content is not None and (nexth2Content is None
                                               or content.text != nexth2Content.text):
                    sections[h2.text].append(content.text)
                    content = content.find_next("p")
            else:
                # last heading: take every remaining <p> that carries text
                sections[h2.text] = [p.text for p in h2.find_all_next("p", string=True)]
        return sections

    def getListSection(self, scienceDirect=None):
        """Collect the article text, falling back to the Elsevier API for
        ScienceDirect articles that cannot be scraped directly."""
        sections = {}
        if scienceDirect is None:
            sections = self.getSectionDict()
            # typical keys: 'Abstract', 'Introduction', 'Methods', 'Results',
            # 'Discussion', 'References', 'Acknowledgements', 'Author information',
            # 'Ethics declarations', 'Additional information',
            # 'Electronic supplementary material', 'Rights and permissions',
            # 'About this article', 'Search', 'Navigation'
        if scienceDirect is not None or len(sections) == 0:
            # replace with your actual Elsevier API key; never commit a real key
            api_key = "YOUR_ELSEVIER_API_KEY"
            # ScienceDirect article DOI, e.g. "10.1016/j.ajhg.2011.01.009"
            doi = self.htmlLink.split("https://doi.org/")[-1]
            # base URL of the Elsevier article-retrieval API
            base_url = "https://api.elsevier.com/content/article/doi/"
            headers = {
                "Accept": "application/json",
                "X-ELS-APIKey": api_key
            }
            response = requests.get(base_url + doi, headers=headers)
            if response.status_code == 200:
                data = response.json()
                supp_data = data["full-text-retrieval-response"]
                if "originalText" in supp_data:
                    if isinstance(supp_data["originalText"], str):
                        sections["originalText"] = [supp_data["originalText"]]
                    elif isinstance(supp_data["originalText"], dict):
                        sections["originalText"] = list(supp_data["originalText"].values())
                elif isinstance(supp_data, dict):
                    for key in supp_data:
                        sections[key] = [supp_data[key]]
        textJson = self.mergeTextInJson(sections)
        textHTML = self.getText()
        # keep whichever extraction recovered more text
        return textHTML if len(textHTML) > len(textJson) else textJson

    def getReference(self):
        """Collect cleaned reference entries so further data can be crawled."""
        ref = []
        sections = self.getSectionDict()
        for entry in sections.get("References", []):
            ct = cleanText.cleanGenText(entry)
            cleaned, filteredWord = ct.cleanText()
            if cleaned not in ref:
                ref.append(cleaned)
        return ref

    def getSupMaterial(self):
        """Collect https links under supplementary-material-style headings."""
        supp = {}
        soup = self.openHTMLFile()
        h2Tags = soup.find_all('h2')
        keywords = ("supplementary", "material", "additional", "support")
        for h2Pos, h2 in enumerate(h2Tags):
            if any(k in h2.text.lower() for k in keywords):
                link, output = [], []
                if h2.text not in supp:
                    supp[h2.text] = []
                for l in h2.find_all_next("a", href=True):
                    link.append(l["href"])
                if h2Pos + 1 < len(h2Tags):
                    # drop links that belong to the next section
                    nexth2Anchor = h2Tags[h2Pos + 1].find_next("a", href=True)
                    if nexth2Anchor is not None and nexth2Anchor["href"] in link:
                        link = link[:link.index(nexth2Anchor["href"])]
                # only keep links that contain "https"
                for i in link:
                    if "https" in i:
                        output.append(i)
                supp[h2.text].extend(output)
        return supp

    def extractTable(self):
        """Return every table in the document as a pandas DataFrame."""
        soup = self.openHTMLFile()
        try:
            # StringIO avoids pandas' FutureWarning about literal HTML strings
            df = pd.read_html(StringIO(str(soup)))
        except ValueError:
            df = []
            print("No tables found in HTML file")
        return df

    def mergeTextInJson(self, jsonHTML):
        """Flatten a section dict into one string: paragraphs inside a section
        are joined with periods, sections are separated by a blank line."""
        cl = cleanText.cleanGenText()
        htmlText = ""
        for sec in jsonHTML:
            if len(jsonHTML[sec]) > 0:
                for i in range(len(jsonHTML[sec])):
                    text = jsonHTML[sec][i]
                    if len(text) > 0:
                        text, filteredWord = cl.textPreprocessing(text, keepPeriod=True)
                    jsonHTML[sec][i] = text
                    # make sure the previous paragraph ended with a period
                    if i - 1 >= 0 and len(jsonHTML[sec][i - 1]) > 0 \
                            and jsonHTML[sec][i - 1][-1] != ".":
                        htmlText += ". "
                    htmlText += jsonHTML[sec][i]
                    if len(jsonHTML[sec][i]) > 0 and jsonHTML[sec][i][-1] != ".":
                        htmlText += "."
                htmlText += "\n\n"
        return htmlText

    def removeHeaders(self):
        pass  # TODO: not implemented yet

    def removeFooters(self):
        pass  # TODO: not implemented yet

    def removeReferences(self):
        pass  # TODO: not implemented yet
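
# ------------------------------------------------------------------
# Minimal usage sketch (an assumption, not part of the module API):
# "article.html" is a hypothetical local path, and the link reuses the
# example DOI from the comments above. Requires the NER and
# DefaultPackages helpers to be importable, plus network access.
if __name__ == "__main__":
    page = HTML(htmlFile="article.html",
                htmlLink="https://doi.org/10.1016/j.ajhg.2011.01.009")
    sections = page.getSectionDict()    # {h2 heading: [paragraph texts]}
    suppLinks = page.getSupMaterial()   # {heading: [https links]}
    tables = page.extractTable()        # list of pandas DataFrames
    fullText = page.getListSection()    # merged text, HTML or API fallback
    print(list(sections.keys()))
    print(suppLinks)
    print(len(tables), "tables found")
    print(fullText[:300])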