import {
  getDocument,
  GlobalWorkerOptions,
  version,
} from 'pdfjs-dist/legacy/build/pdf'
import { PDFDocumentProxy } from 'pdfjs-dist/types/src/display/api'

import {
  PdfExtractionDefinition,
  PostProcessor,
} from '../../client/portal-client'
import dayjs from 'dayjs'

import customParseFormat from 'dayjs/plugin/customParseFormat'
// Register the customParseFormat plugin so that dayjs(value, format) can parse
// a date string against an explicit format, as convertDateFormat below does.
dayjs.extend(customParseFormat)

// Point pdf.js at a worker script hosted on cdnjs, using the `version` exported
// by the imported pdfjs-dist build to pick the matching release. The URL is
// protocol-relative (it starts with `//`), so it inherits the scheme of
// whatever base URL it is resolved against.
GlobalWorkerOptions.workerSrc = `//cdnjs.cloudflare.com/ajax/libs/pdf.js/${version}/pdf.worker.min.js`

/** Names of the fields that can be filled from a PDF. */
type PdfExtractionField =
  | 'studentId'
  | 'firstName'
  | 'lastName'
  | 'email'
  | 'dateOfBirth'
  | 'expiryDate'
  | 'emailCc'
  | 'emailBcc'

/**
 * Values extracted from a PDF, keyed by field name.
 *
 * Every field is optional and holds a string when present; a field may also
 * be explicitly set to `undefined`.
 */
export type PdfExtractionValues = {
  [Field in PdfExtractionField]?: string | undefined
}

/**
 * Pulls a configurable set of values out of the text content of a PDF.
 *
 * Every {@link PdfExtractionDefinition} either carries a fixed default value
 * or a regular expression that is run against the text of the whole document.
 * The capture groups it selects are concatenated, trimmed and then passed
 * through the definition's post-processors.
 */
export class PdfValueExtractor {
  /**
   * Loads the PDF at `url` and resolves a value for every definition.
   *
   * @param url - Location of the PDF; handed to pdf.js `getDocument` as is.
   * @param definitions - Extraction rules, applied in order. A later
   *   definition with the same key overwrites the value of an earlier one.
   * @returns The extracted values. A key whose definition has no default value
   *   and whose regex does not match is left unset.
   * @throws Whatever pdf.js rejects with when the document cannot be loaded,
   *   or the `SyntaxError` raised by an invalid `definition.regex`.
   */
  async extract(
    url: string,
    definitions: Array<PdfExtractionDefinition>
  ): Promise<PdfExtractionValues> {
    const allText = await this.loadAllText(url)

    const values: PdfExtractionValues = {}
    for (const definition of definitions) {
      // A configured default always wins; the regex is not consulted.
      if (definition.defaultValue) {
        values[definition.key] = definition.defaultValue
        continue
      }

      const match = new RegExp(definition.regex).exec(allText)
      if (!match) {
        continue
      }

      // Concatenate the requested capture groups; groups that did not
      // participate in the match contribute nothing.
      const matchedText = definition.matchGroups
        .map((group) => match[group] || '')
        .join('')
        .trim()
      values[definition.key] = this.conditionallyPostProcess(
        definition,
        matchedText
      )
    }

    return this.conditionallyImputeValues(definitions, values)
  }

  /**
   * Loads the document at `url`, returns its full text and always releases
   * the pdf.js loading task afterwards so the document and its worker do not
   * outlive the extraction.
   */
  private async loadAllText(url: string): Promise<string> {
    const loadingTask = getDocument(url)
    try {
      return await this.getAllText(await loadingTask.promise)
    } finally {
      await loadingTask.destroy()
    }
  }

  /**
   * Returns the text of all pages, in page order, joined by single spaces.
   * Pages are requested concurrently.
   */
  private async getAllText(pdfDocument: PDFDocumentProxy): Promise<string> {
    // pdf.js page numbers are 1-based.
    const pageNumbers = Array.from(
      { length: pdfDocument.numPages },
      (_, index) => index + 1
    )
    const textPerPage = await Promise.all(
      pageNumbers.map((pageNum) => this.getPageText(pdfDocument, pageNum))
    )
    return textPerPage.join(' ')
  }

  /**
   * Returns the text items of a single page joined by single spaces.
   *
   * Items that carry no `str` property contribute an empty string, so they
   * still produce a separator in the result.
   */
  private async getPageText(
    pdfDocument: PDFDocumentProxy,
    pageNum: number
  ): Promise<string> {
    const pdfPage = await pdfDocument.getPage(pageNum)
    const textContent = await pdfPage.getTextContent()

    return textContent.items
      .map((item) => ('str' in item ? item.str : ''))
      .join(' ')
  }

  /**
   * Runs the definition's post-processors over `matchedText`, in order.
   *
   * @returns The processed text, or `matchedText` unchanged when the
   *   definition has no post-processors.
   */
  private conditionallyPostProcess(
    definition: PdfExtractionDefinition,
    matchedText: string
  ): string {
    let text = matchedText
    for (const postProcessor of definition.postProcessors ?? []) {
      text = this.applyPostProcessor(text, postProcessor)
    }
    return text
  }

  /**
   * Applies a single text post-processor. Names that are not handled here
   * (including the ones handled by `conditionallyImputeValues`) leave the
   * text untouched.
   */
  private applyPostProcessor(
    text: string,
    postProcessor: PostProcessor
  ): string {
    switch (postProcessor.name) {
      case 'remove-whitespace':
        // Strip every whitespace character, not only ASCII spaces: PDF text
        // may contain tabs, line breaks or non-breaking spaces.
        return text.replace(/\s+/g, '')
      case 'replace-space-with-comma':
        return text.replace(/\s+/g, ',')
      case 'format-date':
        return this.convertDateFormat(text, postProcessor.options?.fromFormat)
      default:
        return text
    }
  }

  /**
   * For when we need to do some computation only after
   * we have extracted values from the PDF.
   *
   * Handles the post-processors that combine several extracted values.
   * `values` is updated in place and returned.
   */
  private conditionallyImputeValues(
    definitions: Array<PdfExtractionDefinition>,
    values: PdfExtractionValues
  ): PdfExtractionValues {
    for (const definition of definitions) {
      for (const postProcessor of definition.postProcessors ?? []) {
        switch (postProcessor.name) {
          case 'remove-firstname-from-lastname':
            if (values.lastName) {
              // Removes the first occurrence of the first name only.
              values.lastName = values.lastName
                .replace(values.firstName ?? '', '')
                .trim()
            }
            break
          case 'deduplicate-email':
            if (values.emailCc && values.email) {
              // Compare trimmed, lower-cased addresses so that stray spaces
              // around a comma or different casing do not hide a duplicate.
              const primary = values.email.trim().toLowerCase()
              values.emailCc = values.emailCc
                .split(',')
                .map((address) => address.trim())
                .filter(
                  (address) =>
                    address !== '' && address.toLowerCase() !== primary
                )
                .join(',')
            }
            break
          default:
            break
        }
      }
    }
    return values
  }

  /**
   * Converts `date` from `fromFormat` to `MM/DD/YYYY`.
   *
   * @returns The reformatted date, or `date` unchanged when it cannot be
   *   parsed.
   */
  private convertDateFormat(date: string, fromFormat = 'DD/MM/YYYY'): string {
    const parsed = dayjs(date, fromFormat)
    // dayjs formats an unparseable date as the literal text "Invalid Date";
    // keep the extracted text instead so that string never becomes a value.
    return parsed.isValid() ? parsed.format('MM/DD/YYYY') : date
  }
}
