import { DepositionParser } from "./DepositionParser.ts";
import { Chunk, ChunkWithCount } from "./types.ts";

// Number of parsed pages that TextChunker.createChunksDeposition groups into one chunk.
const PAGES_PER_CHUNK = 6;
// Size cutoff expressed in tokens; only used to derive CUTOFF_WORD_COUNT below.
const CUTOFF_TOKENS = 2000;
// Presumably the estimated number of tokens per word — TODO confirm with the model/tokenizer in use.
const TOKEN_MULTIPLIER = 1.25;

// Size cutoff expressed in words (2000 / 1.25 = 1600). TextChunker compares chunk word
// counts against this value.
const CUTOFF_WORD_COUNT = CUTOFF_TOKENS / TOKEN_MULTIPLIER;

/**
 * Splits a document into titled chunks, each intended to stay within
 * CUTOFF_WORD_COUNT words.
 *
 * Two strategies are available:
 *  - page based (`createChunksDeposition`): groups pages produced by
 *    DepositionParser, PAGES_PER_CHUNK at a time;
 *  - sentence based (`createChunksWithNLP`): packs sentences returned by
 *    `getSentences` greedily up to the word cutoff.
 *
 * `getChunks` picks between them and records the choice in `isDeposition`.
 */
export class TextChunker {
  private text: string;
  /** True when the last `getChunks()` call returned the page-based chunks. */
  isDeposition: boolean;

  /**
   * @param text Full document text to be chunked.
   */
  constructor(text: string) {
    this.text = text;
    this.isDeposition = false;
  }

  /**
   * Returns the page-based chunks when that strategy produced at least one
   * chunk and none of them exceeds CUTOFF_WORD_COUNT words; otherwise falls
   * back to sentence packing. Updates `isDeposition` accordingly.
   *
   * @returns The chunks of the selected strategy.
   */
  getChunks(): Chunk[] {
    const chunks = this.createChunksDeposition();
    const hasOversizedChunk = chunks.some(
      (chunk) => chunk.wordCount > CUTOFF_WORD_COUNT
    );
    // An empty page list means the parser recognised nothing, so the text is
    // handed to the sentence-based strategy instead of being dropped.
    if (chunks.length > 0 && !hasOversizedChunk) {
      // TODO: create a more dedicated way to determine if a file is a deposition
      this.isDeposition = true;
      return chunks;
    }
    this.isDeposition = false;

    return this.createChunksWithNLP();
  }

  /**
   * Builds chunks from the pages yielded by DepositionParser, PAGES_PER_CHUNK
   * pages per chunk, titled with the page numbers they contain.
   *
   * When a chunk (other than the last) ends in an unfinished sentence, that
   * trailing fragment is moved to the start of the following chunk so the
   * sentence is not split across two chunks. `sentences` and `wordCount` are
   * recomputed afterwards so they always describe the final `text`.
   *
   * @returns The chunks together with their sentences and word counts.
   */
  createChunksDeposition(): ChunkWithCount[] {
    const parser = new DepositionParser(this.text);
    const pages = [...parser.parse()];
    const chunks: ChunkWithCount[] = [];
    for (let start = 0; start < pages.length; start += PAGES_PER_CHUNK) {
      const groupOfPages = pages.slice(start, start + PAGES_PER_CHUNK);
      const text = groupOfPages.map((p) => p.text).join(" ");
      chunks.push({
        text,
        sentences: getSentences(text),
        wordCount: countWords(text),
        title: `Pages: ${groupOfPages.map((p) => p.pageNumber).join(", ")}`,
      });
    }

    // A sentence is complete when it ends in . ? or !, optionally followed by
    // the same closing brackets/quotes that getSentences keeps attached to a
    // sentence, and trailing whitespace.
    const endsSentence = /[.?!][\])'"`’”]*\s*$/;

    chunks.forEach((chunk, index) => {
      const nextChunk = chunks[index + 1];
      if (nextChunk === undefined) {
        return;
      }

      const sentences = chunk.sentences ?? [];
      // Never move the only sentence: that would leave the chunk empty.
      if (sentences.length <= 1) {
        return;
      }
      const lastSentence = sentences[sentences.length - 1];
      if (lastSentence === undefined || endsSentence.test(lastSentence)) {
        return;
      }

      // Remove the LAST occurrence; the same fragment may also appear earlier
      // in the chunk and must stay untouched there.
      const cutAt = chunk.text.lastIndexOf(lastSentence);
      if (cutAt === -1) {
        return;
      }
      chunk.text =
        chunk.text.slice(0, cutAt) +
        chunk.text.slice(cutAt + lastSentence.length);
      nextChunk.text = lastSentence + " " + nextChunk.text;

      // Keep the derived fields in sync with the edited text; getChunks relies
      // on wordCount and the next iteration relies on nextChunk.sentences.
      chunk.sentences = getSentences(chunk.text);
      chunk.wordCount = countWords(chunk.text);
      nextChunk.sentences = getSentences(nextChunk.text);
      nextChunk.wordCount = countWords(nextChunk.text);
    });

    return chunks;
  }

  /**
   * Splits the whole text into sentences and packs them into chunks of at
   * most CUTOFF_WORD_COUNT words, titled "Summary part N".
   *
   * @returns The sentence-packed chunks, in document order.
   */
  createChunksWithNLP(): Chunk[] {
    const sentences: string[] = [...getSentences(this.text)];
    const chunks: string[] = [];
    while (sentences.length > 0) {
      chunks.push(this.getChunk(sentences));
    }
    return chunks.map((chunk, index) => ({
      text: chunk,
      title: `Summary part ${index + 1}`,
    }));
  }

  /**
   * Removes sentences from the front of `sentences` and concatenates them
   * (each followed by a space) until adding the next one would exceed
   * CUTOFF_WORD_COUNT words.
   *
   * A single sentence that is larger than the cutoff on its own is split at
   * the word limit: the first part is returned and the remainder is put back
   * at the front of the queue. This guarantees every call on a non-empty
   * queue consumes input, so callers looping until the queue is empty always
   * terminate.
   *
   * @param sentences Queue of pending sentences; mutated in place.
   * @returns The text of the next chunk ("" only when the queue was empty).
   */
  getChunk(sentences: string[]): string {
    const maxWords = Math.max(1, Math.floor(CUTOFF_WORD_COUNT));
    let toSend = "";
    let toSendWordCount = 0;
    while (sentences.length > 0) {
      const sentence = sentences.shift() ?? "";
      const wCount = countWords(sentence);
      if (toSendWordCount + wCount <= CUTOFF_WORD_COUNT) {
        toSend += sentence + " ";
        toSendWordCount += wCount;
        continue;
      }

      if (toSendWordCount > 0) {
        // The chunk already has content: leave this sentence for the next one.
        sentences.unshift(sentence);
        break;
      }

      // The sentence alone exceeds the budget; split it on word boundaries.
      const words = sentence.trim().split(/\s+/);
      toSend += words.slice(0, maxWords).join(" ") + " ";
      const remainder = words.slice(maxWords).join(" ");
      if (remainder.length > 0) {
        sentences.unshift(remainder);
      }
      break;
    }
    return toSend;
  }
}

/**
 * Counts whitespace-separated words in `text`.
 *
 * Leading and trailing whitespace is ignored, and any run of whitespace
 * counts as a single separator.
 *
 * @param text Text to measure.
 * @returns The number of words; 0 for an empty or whitespace-only string.
 */
function countWords(text: string): number {
  const trimmed = text.trim();
  // "".split(/\s+/) yields [""], which would otherwise be reported as one word.
  if (trimmed === "") {
    return 0;
  }
  return trimmed.split(/\s+/).length;
}

/**
 * Breaks `text` into sentence-like pieces.
 *
 * A piece is a run of characters other than . ? ! followed by one or more of
 * those terminators and any closing brackets/quotes directly after them.
 * Where no such piece can start, the rest of the current line is returned
 * as a piece of its own.
 *
 * @param text Text to split.
 * @returns The matched pieces in order, or an empty array when nothing matches.
 */
function getSentences(text: string) {
  const sentencePattern = /[^.?!]+[.!?]+[\])'"`’”]*|.+/g;
  const pieces = text.match(sentencePattern);
  return pieces === null ? [] : pieces;
}
