mirror of
https://github.com/sismics/docs.git
synced 2025-12-14 02:06:25 +00:00
#118: extract text content from text plain files (WIP)
This commit is contained in:
@@ -64,11 +64,12 @@ public class FileUtil {
|
||||
private static String ocrFile(InputStream inputStream, String language) {
|
||||
Tesseract instance = Tesseract.getInstance();
|
||||
String content = null;
|
||||
BufferedImage image = null;
|
||||
BufferedImage image;
|
||||
try {
|
||||
image = ImageIO.read(inputStream);
|
||||
} catch (IOException e) {
|
||||
log.error("Error reading the image", e);
|
||||
return null;
|
||||
}
|
||||
|
||||
// Upscale and grayscale the image
|
||||
@@ -92,10 +93,9 @@ public class FileUtil {
|
||||
* Save a file on the storage filesystem.
|
||||
*
|
||||
* @param inputStream Unencrypted input stream
|
||||
* @param pdf
|
||||
* @param pdfInputStream PDF input stream
|
||||
* @param file File to save
|
||||
* @param privateKey Private key used for encryption
|
||||
* @throws Exception
|
||||
*/
|
||||
public static void save(InputStream inputStream, InputStream pdfInputStream, File file, String privateKey) throws Exception {
|
||||
Cipher cipher = EncryptionUtil.getEncryptionCipher(privateKey);
|
||||
@@ -114,9 +114,8 @@ public class FileUtil {
|
||||
* @param inputStream Unencrypted input stream
|
||||
* @param pdfInputStream Unencrypted PDF input stream
|
||||
* @param cipher Cipher to use for encryption
|
||||
* @throws Exception
|
||||
*/
|
||||
public static void saveVariations(File file, InputStream inputStream, InputStream pdfInputStream, Cipher cipher) throws Exception {
|
||||
private static void saveVariations(File file, InputStream inputStream, InputStream pdfInputStream, Cipher cipher) throws Exception {
|
||||
BufferedImage image = null;
|
||||
if (ImageUtil.isImage(file.getMimeType())) {
|
||||
image = ImageIO.read(inputStream);
|
||||
@@ -151,7 +150,6 @@ public class FileUtil {
|
||||
* Remove a file from the storage filesystem.
|
||||
*
|
||||
* @param file File to delete
|
||||
* @throws IOException
|
||||
*/
|
||||
public static void delete(File file) throws IOException {
|
||||
Path storedFile = DirectoryUtil.getStorageDirectory().resolve(file.getId());
|
||||
|
||||
@@ -86,7 +86,6 @@ public class PdfUtil {
|
||||
* @param inputStream InputStream
|
||||
* @param reset Reset the stream after usage
|
||||
* @return PDF input stream
|
||||
* @throws Exception
|
||||
*/
|
||||
public static InputStream convertToPdf(File file, InputStream inputStream, boolean reset) throws Exception {
|
||||
if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
|
||||
@@ -101,18 +100,36 @@ public class PdfUtil {
|
||||
if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) {
|
||||
return convertOpenDocumentText(inputStream, reset);
|
||||
}
|
||||
|
||||
|
||||
if (file.getMimeType().equals(MimeType.TEXT_PLAIN) || file.getMimeType().equals(MimeType.TEXT_CSV)) {
|
||||
return convertTextPlain(inputStream, reset);
|
||||
}
|
||||
|
||||
// PDF conversion not necessary/possible
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Convert a plain text document to PDF (work in progress: no PDF is produced yet).
|
||||
*
|
||||
* @param inputStream Unencrypted input stream
|
||||
* @param reset When true, {@code inputStream.reset()} is called on the stream
|
||||
* @return PDF input stream; this stub currently always returns {@code null}
* @throws Exception Propagated from {@code inputStream.reset()} when {@code reset} is true
|
||||
*/
|
||||
private static InputStream convertTextPlain(InputStream inputStream, boolean reset) throws Exception {
|
||||
if (reset) {
|
||||
inputStream.reset();
|
||||
}
|
||||
// TODO Create a PDF from the plain text; until then the caller receives null
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert an open document text file to PDF.
|
||||
*
|
||||
* @param inputStream Unencrypted input stream
|
||||
* @param reset Reset the stream after usage
|
||||
* @return PDF input stream
|
||||
* @throws Exception
|
||||
*/
|
||||
private static InputStream convertOpenDocumentText(InputStream inputStream, boolean reset) throws Exception {
|
||||
ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream();
|
||||
@@ -131,7 +148,6 @@ public class PdfUtil {
|
||||
* @param inputStream Unencrypted input stream
|
||||
* @param reset Reset the stream after usage
|
||||
* @return PDF input stream
|
||||
* @throws Exception
|
||||
*/
|
||||
private static InputStream convertOfficeDocument(InputStream inputStream, boolean reset) throws Exception {
|
||||
ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream();
|
||||
@@ -153,7 +169,6 @@ public class PdfUtil {
|
||||
* @param metadata Add a page with metadata
|
||||
* @param margin Margins in millimeters
|
||||
* @return PDF input stream
|
||||
* @throws IOException
|
||||
*/
|
||||
public static InputStream convertToPdf(DocumentDto documentDto, List<File> fileList,
|
||||
boolean fitImageToPage, boolean metadata, int margin) throws Exception {
|
||||
@@ -282,7 +297,6 @@ public class PdfUtil {
|
||||
*
|
||||
* @param inputStream PDF document
|
||||
* @return Render of the first page
|
||||
* @throws IOException
|
||||
*/
|
||||
public static BufferedImage renderFirstPage(InputStream inputStream) throws IOException {
|
||||
try (PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
||||
|
||||
@@ -78,22 +78,26 @@ public class MimeTypeUtil {
|
||||
*/
|
||||
public static String getFileExtension(String mimeType) {
|
||||
switch (mimeType) {
|
||||
// Each recognised MimeType constant maps to a fixed extension; any other value yields "bin".
case MimeType.APPLICATION_ZIP:
|
||||
return "zip";
|
||||
case MimeType.IMAGE_GIF:
|
||||
return "gif";
|
||||
case MimeType.IMAGE_JPEG:
|
||||
return "jpg";
|
||||
case MimeType.IMAGE_PNG:
|
||||
return "png";
|
||||
case MimeType.APPLICATION_PDF:
|
||||
return "pdf";
|
||||
case MimeType.OPEN_DOCUMENT_TEXT:
|
||||
return "odt";
|
||||
case MimeType.OFFICE_DOCUMENT:
|
||||
return "docx";
|
||||
default:
|
||||
return "bin";
|
||||
// NOTE(review): the case list is repeated from here on (this view appears to show both sides of the change);
// this second list additionally maps TEXT_PLAIN to "txt" and TEXT_CSV to "csv".
case MimeType.APPLICATION_ZIP:
|
||||
return "zip";
|
||||
case MimeType.IMAGE_GIF:
|
||||
return "gif";
|
||||
case MimeType.IMAGE_JPEG:
|
||||
return "jpg";
|
||||
case MimeType.IMAGE_PNG:
|
||||
return "png";
|
||||
case MimeType.APPLICATION_PDF:
|
||||
return "pdf";
|
||||
case MimeType.OPEN_DOCUMENT_TEXT:
|
||||
return "odt";
|
||||
case MimeType.OFFICE_DOCUMENT:
|
||||
return "docx";
|
||||
case MimeType.TEXT_PLAIN:
|
||||
return "txt";
|
||||
case MimeType.TEXT_CSV:
|
||||
return "csv";
|
||||
default:
|
||||
return "bin";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user