module internal DataProcessing

open System.Text.RegularExpressions
open DataCleaning
open System
open System.IO

/// Returns true when `pattern` (a regular expression) matches anywhere in `text`.
let MatchText pattern text =
    Regex.IsMatch(text, pattern)

/// Joins the given words into one string, separated by single spaces.
let ConcatToString words =
    words |> String.concat " "

/// Validates the gibberish level.
/// Raises an argument exception (via `invalidArg`) when the value is below 2 or above 20.
/// NOTE(review): on success this returns the `ignore` function itself rather than unit;
/// kept as-is because callers may depend on that type - confirm whether `()` was intended.
let GibberishLevelIsValid gibberishLevel =
    if gibberishLevel < 2 || gibberishLevel > 20 then
        invalidArg "gibberishLevel" "Invalid argument. Must be between 2 and 20 (inclusive)."
    else
        ignore

/// Validates the requested number of sentences.
/// Raises an argument exception (via `invalidArg`) when the value is less than 1.
/// NOTE(review): on success this returns the `ignore` function itself rather than unit.
let SentencesIsValid sentences =
    if sentences < 1 then
        invalidArg "sentences" "Invalid argument. Must be greater than 0."
    else
        ignore

/// Validates an input file path.
/// The extension is checked first (exact, case-sensitive comparison with ".txt"),
/// and only then is the file's existence checked.
/// Raises an argument exception for a wrong extension and FileNotFoundException
/// when the file does not exist.
/// NOTE(review): on success this returns the `ignore` function itself rather than unit.
let FilePathIsValid filePath =
    if Path.GetExtension filePath <> ".txt" then
        invalidArg "filePath" "Invalid argument. File must be a .txt file."
    elif not (File.Exists filePath) then
        raise (FileNotFoundException("Unable to find the file at the location specified."))
    else
        ignore

/// Returns true when the text contains at least one of ".", "!" or "?".
let TextContainsValidEndToken (text: string) =
    // Tokens are probed in the same order as before; the search stops at the first hit.
    [ "."; "!"; "?" ]
    |> List.exists (fun (token: string) -> text.Contains token)

/// Splits the text with `SplitText` using the whitespace pattern `\s+`, then
/// produces every sliding window of `groupSize` consecutive items.
let SortIntoGroups groupSize text =
    text
    |> SplitText @"\s+" // presumably splits on runs of whitespace - see DataCleaning.SplitText
    |> Seq.windowed groupSize

/// Splits an array of words into a pair: all words except the last joined with
/// spaces, and the last word on its own.
let BisectWords words =
    let lastIndex = Array.length words - 1
    let leading = Seq.take lastIndex words |> ConcatToString
    leading, words.[lastIndex]

/// Joins `prev` and `next` with a space, leaving out whichever of them is
/// null, empty or whitespace-only.
let CombineWords prev next =
    let hasContent (word: string) = not (String.IsNullOrWhiteSpace word)
    [ prev; next ]
    |> List.filter hasContent
    |> ConcatToString

/// Runs the standard clean-up over raw text: applies `ReplaceArtifact` for the
/// double quote, the "\n\nIn" sequence, carriage returns and newlines (in that
/// order), then splits on `\s+` and re-joins the pieces with single spaces.
let ApplyStandardSetup text =
    let normalise =
        ReplaceArtifact "\""
        >> ReplaceArtifact "\n\nIn"
        >> ReplaceArtifact "\r"
        >> ReplaceArtifact "\n"
        >> SplitText @"\s+"
        >> ConcatToString
    normalise text