summaryrefslogtreecommitdiff
path: root/src/inc/DocumentAnalyze.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/inc/DocumentAnalyze.h')
-rw-r--r--src/inc/DocumentAnalyze.h235
1 files changed, 235 insertions, 0 deletions
diff --git a/src/inc/DocumentAnalyze.h b/src/inc/DocumentAnalyze.h
new file mode 100644
index 0000000..8b39239
--- /dev/null
+++ b/src/inc/DocumentAnalyze.h
@@ -0,0 +1,235 @@
+/*
+ *
+ * Copyright (c) 2014
+ * String Algorithms Research Group
+ * Institute of Information Engineering, Chinese Academy of Sciences (IIE-CAS)
+ * National Engineering Laboratory for Information Security Technologies (NELIST)
+ * All rights reserved
+ *
+ * Written by: LIU YANBING ([email protected])
+ LU YUHAI ([email protected])
+
+ * Last modification: 2014-9-22
+ *
+ * This code is the exclusive and proprietary property of IIE-CAS and NELIST.
+ * Usage for direct or indirect commercial advantage is not allowed without
+ * written permission from the authors.
+ *
+ */
+
+#ifndef H_DOCUMENT_ANALYZE_H
+#define H_DOCUMENT_ANALYZE_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* Status codes returned by the docanalyze_* functions whose return type is int. */
+#define DOC_PRO_OK 0
+/* Parenthesized so the negative literal cannot bind to neighbouring operators after expansion. */
+#define DOC_PRO_ERR (-1)
+
+ /*
+ * Document (file format) types.
+ * Enumerators are numbered consecutively starting at DOC_UNKNOWN_TYPE = 0;
+ * aliases introduced below reuse an existing value so no number shifts.
+ * The last enumerator has no trailing comma because C89 and C++98/03
+ * compilers reject one, and this header is meant to be usable from C++.
+ */
+ enum DocumentType
+ {
+ DOC_UNKNOWN_TYPE = 0, /* unknown document type */
+
+ DOC_TXT_TYPE, /* plain text */
+
+ /* Office 97-2003 document types */
+ DOC_DOC_TYPE, /* Word 97-2003 */
+ DOC_PPT_TYPE, /* Powerpoint 97-2003 */
+ DOC_XLS_TYPE, /* Excel 97-2003 */
+
+ /* Office 2007 document types */
+ DOC_DOCX_TYPE, /* Word 2007 */
+ DOC_PPTX_TYPE, /* Powerpoint 2007 */
+ DOC_XLSX_TYPE, /* Excel 2007 */
+
+ /* Open Office document types */
+ DOC_ODT_TYPE, /* ODT format */
+ DOC_ODS_TYPE, /* ODS format */
+ DOC_ODP_TYPE, /* ODP format */
+
+ /* iWork document types */
+ DOC_PAGES_TYPE, /* iWork Page */
+ DOC_KEY_TYPE, /* iWork Keynote */
+ DOC_NUMBERS_TYPE, /* iWork Numbers */
+
+ /* Other document types */
+ DOC_PDF_TYPE, /* pdf format */
+ DOC_EML_TYPE, /* eml format */
+ DOC_HTML_TYPE, /* HTML format */
+ DOC_CHM_TYPE, /* chm format */
+ DOC_RTF_TYPE, /* rtf format */
+ DOC_MDB_TYPE, /* Microsoft Access database file */
+
+ /* Compressed file formats */
+ DOC_ZIP_TYPE, /* zip format */
+ DOC_RAR_TYPE, /* rar format */
+ DOC_GZIP_TYPE, /* gzip format */
+ DOC_BZIP_TYPE, /* bzip format */
+ DOC_7Z_TYPE, /* 7z format */
+ DOC_DEFLATE_TYPE, /* deflate format */
+
+ /* Image file formats */
+ DOC_BMP_TYPE, /* bmp format */
+ DOC_GIF_TYPE, /* gif format */
+ DOC_JPEG_TYPE, /* jpeg format */
+ DOC_RAW_TYPE, /* raw format */
+ DOC_EMF_TYPE, /* emf format */
+ DOC_WMF_TYPW, /* wmf format (misspelled name, kept so existing callers still compile) */
+ DOC_WMF_TYPE = DOC_WMF_TYPW, /* correctly spelled alias; same value, so DOC_PNG_TYPE keeps its number */
+ DOC_PNG_TYPE, /* png format */
+ DOC_PCT_TYPE, /* pct format */
+ DOC_PBM_TYPE, /* pbm format */
+ DOC_PPM_TYPE, /* ppm format */
+
+ /* Video file formats */
+ DOC_MPEG4_TYPE, /* mpeg4 format */
+ DOC_AV_TYPE, /* av format */
+ DOC_3GP_TYPE, /* 3gp format */
+ DOC_ASF_TYPE, /* asf format */
+ DOC_AVI_TYPE, /* avi format */
+ DOC_MKV_TYPE, /* mkv format */
+ DOC_MOV_TYPE, /* mov format */
+ DOC_MP4_TYPE, /* mp4 format */
+ DOC_RMVB_TYPE, /* rmvb format */
+ DOC_RM_TYPE, /* rm format */
+ DOC_WMV_TYPE, /* wmv format */
+
+ /* Audio file formats */
+ DOC_MP3_TYPE, /* mp3 format */
+ DOC_WAV_TYPE, /* wav format */
+ DOC_WMA_TYPE, /* wma format */
+ DOC_AAC_TYPE, /* aac format */
+ DOC_OGG_TYPE, /* ogg format */
+ DOC_APE_TYPE, /* ape format */
+ DOC_FLAC_TYPE, /* flac format */
+
+ /* Executable file formats */
+ DOC_PE_TYPE, /* Windows executable format, including exe, dll, vxd, sys, vdm, etc. */
+ DOC_ELF_TYPE /* Linux executable format */
+ };
+
+ /*
+ * Character encodings that can be attached to a piece of result data
+ * (see the code_type field of docanalyze_result_t).
+ * No trailing comma after the last enumerator: C89 and C++98/03 reject it.
+ */
+ enum CodeType
+ {
+ CODE_UNKNOWN_TYPE = 0, /* unknown encoding */
+ CODE_GBK_TYPE, /* GBK */
+ CODE_BIG5_TYPE, /* BIG5 */
+ CODE_UNICODE_TYPE, /* UNICODE */
+ CODE_UTF8_TYPE, /* UTF8 */
+ CODE_UTF7_TYPE /* UTF7 */
+ };
+
+ /*
+ * Sub-document kinds. The original comment names zip (compressed documents)
+ * and eml as the formats that need them.
+ * No trailing comma after the last enumerator: C89 and C++98/03 reject it.
+ */
+ enum DocumentSubType
+ {
+ DOC_UNKNOW_SUBTYPE = 0, /* unknown sub-type (misspelled name, kept so existing callers still compile) */
+ DOC_UNKNOWN_SUBTYPE = DOC_UNKNOW_SUBTYPE, /* correctly spelled alias; same value, later enumerators are unaffected */
+ DOC_FILENAME_SUBTYPE, /* file name of a sub-document */
+ DOC_CONTENT_SUBTYPE /* content of a sub-document */
+ };
+
+ /* One piece of analysis output: a data buffer tagged with its document type, sub-type and encoding. */
+ typedef struct _docanalyze_result_t
+ {
+ char * presult; /* the data content */
+ int size; /* length of the data (unit presumably bytes - confirm) */
+ enum DocumentType doc_type; /* document type this piece of data belongs to */
+ enum DocumentSubType doc_sub_type; /* sub-document type this piece of data belongs to */
+ enum CodeType code_type; /* current encoding of this piece of data */
+ }docanalyze_result_t;
+
+ /* Array of results; this is the type of the result_array argument of docanalyze_parsestream and docanalyze_freeresult. */
+ typedef struct _result_array_t
+ {
+ int result_num; /* number of elements in the result array */
+ docanalyze_result_t * result_buff; /* the result array */
+ }result_array_t;
+
+ /* Opaque analyzer handle: returned by docanalyze_initialize, taken by docanalyze_startstream and docanalyze_destroy. */
+ typedef void * docanalyze_instance_t;
+ /* Opaque stream-state handle: returned by docanalyze_startstream, taken by docanalyze_parsestream and docanalyze_endstream. */
+ typedef void * docanalyze_streamparam_t;
+
+ /************************��ʽ�ĵ�������غ���*********************************/
+
+ /*
+ Purpose:
+ Identify a document's type from the information in its header.
+ NOTE(review): the original comment relates hlen to 64 bytes, presumably a recommended minimum header length - confirm.
+ Parameters:
+ header : file header data;
+ hlen : length of the file header;
+ Returns:
+ The identified document type, or DOC_UNKNOWN_TYPE when the type cannot be identified.
+ */
+ enum DocumentType docanalyze_identify(const char * header, unsigned int hlen);
+
+ /*
+ Purpose:
+ Initialize the document analyzer: create the analyzers
+ and the data structures associated with them.
+ Parameters:
+ max_thread_num [in]: number of threads that may run the analyzer concurrently.
+ NOTE(review): the original comment also listed a doc_type [in] parameter, but this
+ function does not take one; the document type is an argument of docanalyze_startstream.
+ Returns:
+ The analyzer handle, or NULL on failure.
+ */
+ docanalyze_instance_t docanalyze_initialize(unsigned int max_thread_num);
+
+ /*
+ Purpose:
+ Release the analyzers of every document type and the data structures associated with them.
+ Parameters:
+ instance [in]: analyzer handle (as returned by docanalyze_initialize).
+ Returns:
+ DOC_PRO_OK on success;
+ DOC_PRO_ERR on failure.
+ */
+ int docanalyze_destroy(docanalyze_instance_t instance);
+
+ /*
+ Purpose:
+ Start one streaming document analysis and obtain the parameter that holds the stream's state.
+ Parameters:
+ doc_type : not described in the original comment; presumably the type of the document
+ to be analyzed in this stream (e.g. the result of docanalyze_identify) - confirm;
+ instance [in]: analyzer handle;
+ thread_id [in]: id of the thread currently running the analysis; must lie in [0, max_thread_num-1].
+ Returns:
+ The parameter holding the stream state; a NULL return value means failure.
+ */
+ docanalyze_streamparam_t docanalyze_startstream(enum DocumentType doc_type, docanalyze_instance_t instance, unsigned int thread_id);
+
+ /*
+ Purpose:
+ Streaming analysis entry point: parse one chunk of raw document data (data); the stream state and
+ some intermediate results are kept in stream_param. The parsed results are stored in the result array
+ (called "res" in the original comment), whose space is allocated inside this function; once done
+ with it, the caller has to call docanalyze_freeresult to release that memory.
+ Parameters:
+ stream_param [in] : parameter holding the stream state;
+ data [in] : document data to be parsed;
+ dlen [in] : length of the data;
+ result_array [out] : the parsed results.
+ Returns:
+ DOC_PRO_OK on success;
+ DOC_PRO_ERR on failure.
+ */
+ int docanalyze_parsestream(docanalyze_streamparam_t stream_param, const char * data, unsigned int dlen, result_array_t * result_array);
+
+ /*
+ Purpose:
+ Release the memory of result_array once the caller has finished using it; this function is meant to be used in pairs with docanalyze_parsestream.
+ Parameters:
+ result_array : not described in the original comment; presumably the array filled by the matching docanalyze_parsestream call - confirm.
+ Returns:
+ DOC_PRO_OK on success;
+ DOC_PRO_ERR on failure.
+ */
+ int docanalyze_freeresult(result_array_t * result_array);
+
+ /*
+ Purpose: end a streaming analysis and release its stream parameter.
+ Parameters:
+ stream_param [in]: the stream parameter (as returned by docanalyze_startstream).
+ Returns:
+ DOC_PRO_OK on success;
+ DOC_PRO_ERR on failure.
+ */
+ int docanalyze_endstream(docanalyze_streamparam_t stream_param);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*end of defined H_DOCUMENT_ANALYZE_H*/ \ No newline at end of file